[
  {
    "path": ".github/ISSUE_TEMPLATE/bug_report.yml",
    "content": "name: Report a bug\ndescription: Report a bug.\nlabels:\n  - \":bug: bug\"\nbody:\n  - type: textarea\n    id: report\n    attributes:\n      label: Add a description\n      placeholder: |\n        Describe and consider providing version information. Please ensure you're on the latest version of lamindb.\n        This is a public repository!\n        Do not reveal any internal information.\n    validations:\n      required: true\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "content": "blank_issues_enabled: true\ncontact_links:\n    - name: LaminHub issues\n      url: https://github.com/laminlabs/laminhub-public\n      about: If you have issues with the GUI/web app at lamin.ai, please report them here.\n    - name: Enterprise support\n      url: https://lamin.ai/contact\n      about: If you have other questions, contact us directly.\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/enhancement.yml",
    "content": "name: Propose an enhancement\ndescription: Propose an enhancement.\nbody:\n  - type: textarea\n    id: description\n    attributes:\n      label: Add a description\n      placeholder: |\n        This is a public repository!\n        Do not reveal any internal information.\n    validations:\n      required: true\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/usage_question.yml",
    "content": "name: Ask a usage question\ndescription: Ask a usage question.\nlabels:\n  - \"usage question\"\nbody:\n  - type: textarea\n    id: description\n    attributes:\n      label: Add a description\n      placeholder: |\n        This is a public repository!\n        Do not reveal any internal information.\n    validations:\n      required: true\n"
  },
  {
    "path": ".github/workflows/build.yml",
    "content": "name: build\n\non:\n  push:\n    branches: [release]\n  pull_request:\n\njobs:\n  pre-filter:\n    runs-on: ubuntu-latest\n    outputs:\n      matrix: ${{ steps.set-matrix.outputs.matrix }}\n    steps:\n      - uses: actions/checkout@v6\n        with:\n          fetch-depth: 0\n\n      - uses: dorny/paths-filter@v3\n        id: changes\n        if: github.event_name != 'push'\n        with:\n          filters: |\n            curator:\n              - 'lamindb/curators/**'\n              - 'lamindb/examples/cellxgene/**'\n              - 'tests/curators/**'\n            integrations:\n              - 'lamindb/integrations/**'\n              - 'tests/integrations/**'\n\n      - id: set-matrix\n        shell: bash\n        run: |\n          BASE_GROUPS=$(jq -n -c '[\"unit-core-sqlite\", \"unit-core-postgres\", \"unit-storage\", \"tutorial\", \"guide\", \"tiledbsoma\", \"biology\", \"faq\", \"storage\", \"cli\", \"permissions\", \"no-instance\"]')\n          ADDITIONAL_GROUPS=[]\n\n          if [[ \"${{ github.event_name }}\" == \"push\" || \"${{ github.event_name }}\" == \"repository_dispatch\" ]]; then\n            # Run everything on push and dispatch\n            ADDITIONAL_GROUPS=$(jq -n -c '[\"curator\", \"integrations\"]')\n          else\n            # Otherwise check which paths changed\n            if [[ \"${{ steps.changes.outputs.curator }}\" == \"true\" ]]; then\n              ADDITIONAL_GROUPS=$(jq -n -c --argjson groups \"$ADDITIONAL_GROUPS\" '$groups + [\"curator\"]')\n            fi\n            if [[ \"${{ steps.changes.outputs.integrations }}\" == \"true\" ]]; then\n              ADDITIONAL_GROUPS=$(jq -n -c --argjson groups \"$ADDITIONAL_GROUPS\" '$groups + [\"integrations\"]')\n            fi\n          fi\n\n          # Combine base groups with any additional groups\n          MATRIX=$(jq -n -c --argjson base \"$BASE_GROUPS\" --argjson additional \"$ADDITIONAL_GROUPS\" '{group: ($base + $additional)}')\n\n          # Output as 
single line for GitHub Actions\n          echo \"matrix=$(echo \"$MATRIX\" | jq -c .)\" >> $GITHUB_OUTPUT\n\n          # Pretty print for debugging\n          echo \"Generated matrix:\"\n          echo \"$MATRIX\" | jq .\n\n  test:\n    needs: pre-filter\n    runs-on: ubuntu-latest\n    strategy:\n      fail-fast: false\n      matrix: ${{fromJson(needs.pre-filter.outputs.matrix)}}\n    timeout-minutes: 20\n    steps:\n      - uses: actions/checkout@v6\n        with:\n          submodules: recursive\n          fetch-depth: 0\n\n      - uses: actions/checkout@v6\n        if: ${{ matrix.group == 'permissions' }}\n        with:\n          repository: laminlabs/laminhub\n          token: ${{ secrets.GH_TOKEN_DEPLOY_LAMINAPP }}\n          path: laminhub\n          ref: main\n\n      - uses: actions/setup-python@v6\n        with:\n          python-version: |\n            ${{ matrix.group == 'tiledbsoma' && '3.13' ||\n                matrix.group == 'permissions' && '3.14' ||\n                github.ref == 'refs/heads/release' && '3.11' ||\n                '3.14'\n                }}\n\n      - name: cache pre-commit\n        uses: actions/cache@v4\n        with:\n          path: ~/.cache/pre-commit\n          key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }}\n\n      - name: cache postgres\n        if: ${{ matrix.group == 'faq' || matrix.group == 'unit-core-postgres' || matrix.group == 'unit-storage' || matrix.group == 'permissions'}}\n        id: cache-postgres\n        uses: actions/cache@v4\n        with:\n          path: ~/postgres.tar\n          key: cache-postgres-0\n          restore-keys: |\n            cache-postgres-\n      - name: cache postgres miss\n        if: ${{ (matrix.group == 'faq' || matrix.group == 'unit-core-postgres' || matrix.group == 'unit-storage' || matrix.group == 'permissions') && steps.cache-postgres.outputs.cache-hit != 'true' }}\n        run: docker pull postgres:latest && docker image save postgres:latest --output 
~/postgres.tar\n      - name: cache postgres use\n        if: ${{ (matrix.group == 'faq' || matrix.group == 'unit-core-postgres' || matrix.group == 'unit-storage' || matrix.group == 'permissions') && steps.cache-postgres.outputs.cache-hit == 'true' }}\n        run: docker image load --input ~/postgres.tar\n\n      - run: pip install \"laminci@git+https://github.com/laminlabs/laminci\"\n\n      - run: nox -s configure_coverage -- '${{needs.pre-filter.outputs.matrix}}'\n\n      - name: install postgres\n        if: ${{ matrix.group == 'faq' }}\n        run: sudo apt-get install libpq-dev\n\n      - name: install graphviz\n        if: ${{ matrix.group == 'tutorial' || matrix.group == 'guide' || matrix.group == 'biology' || matrix.group == 'faq'}}\n        run: sudo apt-get -y install graphviz\n\n      # - run: nox -s lint\n      #   if: ${{ matrix.group == 'tutorial' }}\n\n      - run: nox -s \"install_ci(group='${{ matrix.group }}')\"\n\n      - uses: aws-actions/configure-aws-credentials@v4\n        with:\n          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}\n          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}\n          aws-region: us-east-1\n      - run: nox -s prepare\n        if: ${{ !startsWith(matrix.group, 'unit-') && !startsWith(matrix.group, 'permissions') }}\n      - run: nox -s \"test(group='${{ matrix.group }}')\"\n\n      - name: upload coverage\n        uses: actions/upload-artifact@v4\n        with:\n          name: coverage--${{ matrix.group }}\n          path: .coverage\n          include-hidden-files: true\n\n      - name: upload docs\n        if: ${{ matrix.group == 'tutorial' || matrix.group == 'guide' || matrix.group == 'tiledbsoma' || matrix.group == 'biology' || matrix.group == 'faq' || matrix.group == 'storage' }}\n        uses: actions/upload-artifact@v4\n        with:\n          name: docs-${{ matrix.group }}\n          path: ./docs/${{ matrix.group }}\n\n  profile:\n    runs-on: ubuntu-latest\n    
timeout-minutes: 10\n    env:\n      LAMIN_API_KEY: ${{ secrets.LAMIN_API_KEY_TESTUSER1 }}\n    steps:\n      - uses: actions/checkout@v6\n        with:\n          submodules: recursive\n          fetch-depth: 0\n      - uses: actions/setup-python@v6\n        with:\n          python-version: |\n            ${{ github.ref == 'refs/heads/release' && '3.11' ||\n                '3.14'\n                }}\n      - run: pip install git+https://github.com/laminlabs/laminci\n      - run: nox -s \"install_ci(group='unit-core-sqlite')\"\n      - run: uv pip install --system git+https://github.com/laminlabs/laminprofiler\n      - run: lamin login\n      - run: laminprofiler check tests/profiling/import_lamindb_and_connect.py --threshold 3.5\n      - run: lamin connect laminlabs/lamindata\n      - run: laminprofiler check tests/profiling/import_lamindb.py --threshold 1.5\n      - run: laminprofiler check tests/profiling/import_lamindb_core_storage.py --threshold 1.5\n\n  docs:\n    needs: test\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v6\n        with:\n          submodules: recursive\n          fetch-depth: 0\n\n      - name: checkout lndocs\n        uses: actions/checkout@v6\n        with:\n          repository: laminlabs/lndocs\n          ssh-key: ${{ secrets.READ_LNDOCS }}\n          path: lndocs\n          ref: main\n\n      - uses: aws-actions/configure-aws-credentials@v4\n        with:\n          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}\n          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}\n          aws-region: us-east-1\n\n      - uses: actions/setup-python@v6\n        with:\n          python-version: \"3.12\"\n      - run: pip install \"laminci@git+https://x-access-token:${{ secrets.LAMIN_BUILD_DOCS }}@github.com/laminlabs/laminci\"\n      - run: nox -s \"install_ci(group='docs')\"\n      - uses: actions/download-artifact@v4\n      - run: nox -s clidocs\n      - run: nox -s prepare\n      - run: nox -s docs\n  
    - run: rm -r ./_build/html/.doctrees # do not want to deploy with cloudflare\n      - uses: cloudflare/wrangler-action@v3\n        id: cloudflare\n        with:\n          apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}\n          accountId: 472bdad691b4483dea759eadb37110bd\n          command: pages deploy \"_build/html\" --project-name=lamindb\n          gitHubToken: ${{ secrets.GITHUB_TOKEN }}\n      - uses: edumserrano/find-create-or-update-comment@v2\n        if: github.event_name == 'pull_request'\n        with:\n          issue-number: ${{ github.event.pull_request.number }}\n          body-includes: \"Deployment URL\"\n          comment-author: \"github-actions[bot]\"\n          body: |\n            Deployment URL: ${{ steps.cloudflare.outputs.deployment-url }}\n          edit-mode: replace\n\n      - uses: peter-evans/repository-dispatch@v2\n        if: ${{ github.event_name == 'push' }}\n        with:\n          token: ${{ secrets.LAMIN_BUILD_DOCS }}\n          repository: \"laminlabs/lamin-docs\"\n          event-type: build\n\n  coverage:\n    needs: test\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v6\n      - uses: actions/setup-python@v6\n        with:\n          python-version: \"3.14\"\n      - run: |\n          python -m pip install -U uv\n          uv pip install --system coverage[toml]\n          uv pip install --system --no-deps .\n\n      - uses: actions/download-artifact@v4\n      - name: run coverage\n        run: |\n          coverage combine coverage--*/.coverage*\n          coverage report --fail-under=0\n          coverage xml\n      - uses: codecov/codecov-action@v2\n        with:\n          token: ${{ secrets.CODECOV_TOKEN }}\n\n  dispatch:\n    if: ${{ github.event_name == 'push' }}\n    runs-on: ubuntu-latest\n    steps:\n      - uses: peter-evans/repository-dispatch@v2\n        with:\n          token: ${{ secrets.LAMIN_BUILD_DOCS }}\n          repository: \"laminlabs/lamindb-dispatch\"\n          
event-type: build\n"
  },
  {
    "path": ".github/workflows/doc-changes.yml",
    "content": "name: doc-changes\n\non:\n  pull_request_target:\n    branches:\n      - main\n      - release\n    types:\n      - closed\n\njobs:\n  doc-changes:\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v4\n      - uses: actions/setup-python@v5\n        with:\n          python-version: \"3.11\"\n      - run: pip install \"laminci[doc-changes]@git+https://x-access-token:${{ secrets.LAMIN_BUILD_DOCS }}@github.com/laminlabs/laminci\"\n      - run: laminci doc-changes\n        env:\n          repo_token: ${{ secrets.GITHUB_TOKEN }}\n          docs_token: ${{ secrets.LAMIN_BUILD_DOCS }}\n          changelog_file: lamin-docs/docs/changelog/soon/lamindb.md\n"
  },
  {
    "path": ".gitignore",
    "content": "__MACOSX/\n\n# LaminDB\nREADME_stripped.md\ndocs/scripts/test_artifact_parquet.py\nREADME.ipynb\ndocs/sample.fasta\ndocs/faq/sample.fasta\ndocs/faq/test-acid/\ndocs/scripts/define_mini_immuno_features_labels.py\ndocs/scripts/define_mini_immuno_schema_flexible.py\ndocs/scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py\ndocs/scripts/define_valid_features.py\ndocs/scripts/save_mini_immuno_datasets.py\nprofile_output*\ndocs/cli.md\n.coveragerc\n*.db\n*.lndb\n*.jpg\n*.zarr/\ndocsbuild/\ndocs/lamin.md\ndocs/guide/data-validation.ipynb\ndocs/guide/bionty.ipynb\ndocs/guide/lnschema-core.ipynb\ndocs/paradisi05_laminopathic_nuclei.jpg\nbionty_docs/\nlamindb_docs/\n_build\nmydata/\nlamin-intro/\nlamin-tutorial/\nmytest/\nrds/\nmydb/\ndocs/test-registries/\ndocs/test-annotate-flexible/\ndocs/lamindb.*\nlamin_sphinx\ndocs/conf.py\nlamindb/setup/.env\n_secrets.py\n_configuration.py\nlamin.db\ndocs/generated/*\n_docs_tmp*\ndocs/guide/Laminopathic_nuclei.jpg\ndocs/guide/paradisi05_laminopathic_nuclei.jpg\nnocodb\ndocs/guide/SRR4238351_subsamp.fastq.gz\ndocs/faq/paradisi05_laminopathic_nuclei.jpg\ndocs/faq/tostore/\ndocs/faq/mydata_postgres/\ndocs/guide/myobjects/\ndocs/faq/test-run-inputs/\ndocs/intro/paradisi05_laminopathic_nuclei.jpg\ndocs/guide/figures/\ndocs/test-annotate/\ndocs/test-track/\nsuo22/\ndocs/biology/test-flow/\ndocs/biology/test-scrna/\ndocs/biology/test-registries/\ndocs/biology/test-multimodal/\ndefault_storage\ndefault_storage_unit_core\ndefault_storage_unit_storage\ntest.ipynb\ntest2.ipynb\nrun-tests\ntest-django-validation/\ncurate.tiledbsoma\nsmall_dataset.tiledbsoma\nnonregistered_storage\nregistered_storage\ntests/core/notebooks/no-uid-renamed.ipynb\n\n# General\n.DS_Store\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / 
packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\npip-wheel-metadata/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n.python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# ruff\n.ruff_cache\n\n# Pyre type checker\n.pyre/\n\n# data files\ndata/\n_build\n*.csv\n*.fcs\n*.zip\n*.feather\n*.h5ad\n*.h5mu\n*.parquet\n*.bam\n*.fastq.gz\n*.pt\n\n# Pycharm\n.idea\n\n# VSCode\n.vscode\n\n# CELLxGENE\n!lamindb/examples/cellxgene/cellxgene_schema_versions.csv\n\n# ml\nlightning_logs\nmlruns\ndownload_mnist\ncheckpoints\ntest_lightning\n"
  },
  {
    "path": ".gitmodules",
    "content": "[submodule \"sub/lamindb-setup\"]\n\tpath = sub/lamindb-setup\n\turl = https://github.com/laminlabs/lamindb-setup\n[submodule \"sub/lamin-cli\"]\n\tpath = sub/lamin-cli\n\turl = https://github.com/laminlabs/lamin-cli\n[submodule \"sub/bionty\"]\n\tpath = sub/bionty\n\turl = https://github.com/laminlabs/bionty\n[submodule \"sub/pertdb\"]\n\tpath = sub/pertdb\n\turl = https://github.com/laminlabs/pertdb\n[submodule \"sub/cellxgene-lamin\"]\n\tpath = sub/cellxgene-lamin\n\turl = https://github.com/laminlabs/cellxgene-lamin.git\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "fail_fast: false\ndefault_language_version:\n  python: python3\ndefault_stages:\n  - pre-commit\n  - pre-push\nminimum_pre_commit_version: 2.16.0\nrepos:\n  - repo: https://github.com/rbubley/mirrors-prettier\n    rev: v3.5.1\n    hooks:\n      - id: prettier\n        exclude: |\n          (?x)(\n            docs/changelog.md|.github/ISSUE_TEMPLATE/config.yml|tests/core/notebooks/basic-r-notebook.Rmd.cleaned.html|README.md\n          )\n  - repo: https://github.com/kynan/nbstripout\n    rev: 0.8.1\n    hooks:\n      - id: nbstripout\n        exclude: |\n          (?x)(\n              docs/examples/|\n              docs/notes/\n          )\n  - repo: https://github.com/astral-sh/ruff-pre-commit\n    rev: v0.9.10\n    hooks:\n      - id: ruff\n        args: [--fix, --exit-non-zero-on-fix, --unsafe-fixes]\n      - id: ruff-format\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v4.5.0\n    hooks:\n      - id: detect-private-key\n      - id: check-ast\n      - id: end-of-file-fixer\n        exclude: |\n          (?x)(\n              .github/workflows/latest-changes.jinja2\n            )\n      - id: mixed-line-ending\n        args: [--fix=lf]\n      - id: trailing-whitespace\n        exclude: |\n          (?x)(\n              tests/core/notebooks/basic-r-notebook.Rmd.cleaned.html\n            )\n      - id: check-case-conflict\n  - repo: https://github.com/pre-commit/mirrors-mypy\n    rev: v1.14.1\n    hooks:\n      - id: mypy\n        args:\n          [\n            --no-strict-optional,\n            --ignore-missing-imports,\n            --disable-error-code=annotation-unchecked,\n            --disable-error-code=type-arg,\n            --namespace-packages,\n            --explicit-package-bases,\n          ]\n        additional_dependencies:\n          [\"types-requests\", \"types-attrs\", \"types-PyYAML\"]\n        exclude: |\n          (?x)(\n              test_notebooks.py|\n              script-to-test-versioning.py|\n           
   tests/storage/conftest.py|\n              tests/curators/conftest.py|\n              tests/permissions/conftest.py|\n              tests/writelog/conftest.py|\n              tests/writelog_sqlite/conftest.py|\n              tests/curators/test_curators_examples.py|\n              tests/core/conftest.py|\n              docs/scripts/\n          )\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Contributing\n\nContributions are generally welcome. Please make an issue to discuss proposals.\n\n## Installation\n\n### PyPI\n\nFor installation from PyPI, see [docs.lamin.ai/setup](https://docs.lamin.ai/setup).\n\n### Github\n\nFor installation from GitHub, call:\n\n```bash\ngit clone --recursive https://github.com/laminlabs/lamindb\npip install laminci\npython -m venv .venv\nsource .venv/bin/activate\nnox -s install\n```\n\nThis will install a few dependencies from the git submodules linked [here](https://github.com/laminlabs/lamindb/tree/main/sub), as well as packages\nlike `pytest` and `pre-commit` that you'll need when developing.\n\nlamindb depends on several other packages that may require modifications for pull requests to successfully pass the continuous integration build.\nWe suggest the following workflow if commits to any of the submodules are essential for the current modifications in lamindb:\n\n1. Change directory into the submodule that you want to modify: `cd sub/SUBMODULE`.\n2. Switch to a new feature branch: `git switch -c feature/NEWFEATURE`.\n3. Make a pull request with your changes to the `SUBMODULE` and ensure that the CI passes.\n4. 
In the repository root of lamindb, create a new commit and push:\n\n```bash\ncd ../..\ngit add -u\ngit commit -m \"Upgraded SUBMODULE\"\ngit push\n```\n\nAny pull request of yours should now also have the changes of the submodule included, allowing you to test that changes in the submodule and lamindb are compatible.\n\n## Running and writing tests\n\nThis package uses [pytest][] for automated testing.\nPlease add a test for every function added to the package.\n\nRunning tests requires the [Docker daemon][] to be up. Then run:\n\n```bash\npytest --ignore=tests/storage --ignore=tests/permissions\n```\n\nin the root of the repository.\nWe exclude specific directories in local `pytest` runs because they directly access external resources such as AWS, which require specific access keys.\nContinuous integration will automatically run **all** tests on pull requests.\n\n## Code-style\n\nThis project uses [pre-commit][] to enforce consistent code-styles. On every commit, pre-commit checks will either\nautomatically fix issues with the code, or raise an error message.\n\nTo enable pre-commit locally, simply run\n\n```bash\npre-commit install\n```\n\nin the root of the repository. 
Pre-commit will automatically download all dependencies when it is run for the first time.\n\nWe further use [gitmoji][] to add emojis to commits.\nThese allow us to more easily categorize commits, allowing for faster visual filtering.\n\nIt can be installed by running:\n\n```bash\nnpm i -g gitmoji-cli\n```\n\nand enabled for the repository via:\n\n```bash\ngitmoji -i\n```\n\nIf you don't have `sudo` in your working environment, follow [these instructions](https://github.com/sindresorhus/guides/blob/main/npm-global-without-sudo.md).\n\n## Documentation\n\nWe build our documentation with an internal tool called `lndocs`.\nWe have not made it public yet and therefore external contributors need to rely on the GitHub Actions `docs` job to build the documentation.\nIf the `docs` job succeeds, a preview URL will be posted automatically as a comment to your pull request.\n\n## Releases\n\nCurrently, only Lamin employees have release rights.\n\nRelease publishing is managed via `laminci release --pypi`. For `lamindb`, the\nrelease flow now publishes two distributions in sequence:\n\n- `lamindb-core` (contains the `lamindb/` namespace package)\n- `lamindb` (meta-package that depends on `lamindb-core`)\n\nBefore the first production publish of a version, run a TestPyPI dry run by\nbuilding both wheels from `pyproject.toml` and `pyproject.full.toml`, then\nuploading with `twine` to TestPyPI for verification.\n\n[Docker daemon]: https://docs.docker.com/engine/install/\n[gitmoji]: https://gitmoji.dev/\n[pre-commit]: https://pre-commit.com/\n[pytest]: https://docs.pytest.org/\n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "README.md",
    "content": "[![docs](https://img.shields.io/badge/docs-yellow)](https://docs.lamin.ai) [![llms.txt](https://img.shields.io/badge/llms.txt-orange)](https://docs.lamin.ai/llms.txt) [![codecov](https://codecov.io/gh/laminlabs/lamindb/branch/main/graph/badge.svg?token=VKMRJ7OWR3)](https://codecov.io/gh/laminlabs/lamindb) [![pypi](https://img.shields.io/pypi/v/lamindb?color=blue&label=PyPI)](https://pypi.org/project/lamindb) [![cran](https://www.r-pkg.org/badges/version/laminr?color=green)](https://cran.r-project.org/package=laminr) [![stars](https://img.shields.io/github/stars/laminlabs/lamindb?style=flat&logo=GitHub&label=&color=gray)](https://github.com/laminlabs/lamindb) [![downloads](https://static.pepy.tech/personalized-badge/lamindb?period=total&units=INTERNATIONAL_SYSTEM&left_color=GRAY&right_color=GRAY&left_text=%E2%AC%87%EF%B8%8F)](https://pepy.tech/project/lamindb)\n\n# LaminDB - Open-source data framework for biology\n\nLaminDB allows you to query, trace, and validate datasets and models at scale.\nYou get context & memory through a lineage-native lakehouse that supports bio-formats, registries & ontologies while feeling as simple as a file system.\n\nAgent? 
[llms.txt](https://docs.lamin.ai/llms.txt)\n\n<details>\n<summary>Why?</summary>\n\n(1) Reproducing, tracing & understanding how datasets, models & results are created is critical to quality R&D.\nWithout context, humans & agents make mistakes and cannot close feedback loops across data generation & analysis.\nWithout memory, compute & intelligence are wasted on fragmented, non-compounding tasks — LLM context windows are small.\n\n(2) Training & fine-tuning models with thousands of datasets — across LIMS, ELNs, orthogonal assays — is now a primary path to scaling R&D.\nBut without queryable & validated data or with data locked in organizational & infrastructure silos, it leads to garbage in, garbage out or is quite simply impossible.\n\nImagine building software without git or pull requests: an agent's actions would be impossible to verify.\nWhile code has git and tables have dbt/warehouses, biological data has lacked a framework for managing its unique complexity.\n\nLaminDB fills the gap.\nIt is a lineage-native lakehouse that understands bio-registries and formats (`AnnData`, `.zarr`, …) based on the established open data stack:\nPostgres/SQLite for metadata and cross-platform storage for datasets.\nBy offering queries, tracing & validation in a single API, LaminDB provides the context & memory to turn messy, agentic biological R&D into a scalable process.\n\n</details>\n\n<img width=\"800px\" src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/BunYmHkyFLITlM5M000D.svg\">\n\nHow?\n\n- **lineage** → track inputs & outputs of notebooks, scripts, functions & pipelines with a single line of code\n- **lakehouse** → manage, monitor & validate schemas for standard and bio formats; query across many datasets\n- **FAIR datasets** → validate & annotate `DataFrame`, `AnnData`, `SpatialData`, `parquet`, `zarr`, …\n- **LIMS & ELN** → programmatic experimental design with bio-registries, ontologies & markdown notes\n- **unified access** → storage locations (local, S3, 
GCP, …), SQL databases (Postgres, SQLite) & ontologies\n- **reproducible** → auto-track source code & compute environments with data & code versioning\n- **change management** → branching & merging similar to git, plan management for agents\n- **zero lock-in** → runs anywhere on open standards (Postgres, SQLite, `parquet`, `zarr`, etc.)\n- **scalable** → you hit storage & database directly through your `pydata` or R stack, no REST API involved\n- **simple** → just `pip install` from PyPI or `install.packages('laminr')` from CRAN\n- **distributed** → zero-copy & lineage-aware data sharing across infrastructure (databases & storage locations)\n- **integrations** → [git](https://docs.lamin.ai/track#sync-code-with-git), [nextflow](https://docs.lamin.ai/nextflow), [vitessce](https://docs.lamin.ai/vitessce), [redun](https://docs.lamin.ai/redun), and [more](https://docs.lamin.ai/integrations)\n- **extensible** → create custom plug-ins based on the Django ORM, the basis for LaminDB's registries\n\nGUI, permissions, audit logs? [LaminHub](https://lamin.ai) is a collaboration hub built on LaminDB similar to how GitHub is built on git.\n\n<details>\n<summary>Who?</summary>\n\nScientists and engineers at leading research institutions and biotech companies, including:\n\n- **Industry** → Pfizer, Altos Labs, Ensocell Therapeutics, ...\n- **Academia & Research** → scverse, DZNE (National Research Center for Neuro-Degenerative Diseases), Helmholtz Munich (National Research Center for Environmental Health), ...\n- **Research Hospitals** → Global Immunological Swarm Learning Network: Harvard, MIT, Stanford, ETH Zürich, Charité, U Bonn, Mount Sinai, ...\n\nFrom personal research projects to pharma-scale deployments managing petabytes of data across:\n\nentities | OOMs\n--- | ---\nobservations & datasets | 10¹² & 10⁶\nruns & transforms| 10⁹ & 10⁵\nproteins & genes | 10⁹ & 10⁶\nbiosamples & species | 10⁵ & 10²\n... 
| ...\n\n</details>\n\n## Quickstart\n\nTo install the Python package with recommended dependencies, use:\n\n```shell\npip install lamindb\n```\n\n<details>\n<summary>Install with minimal dependencies.</summary>\n\nThe `lamindb` package adds data-science related dependencies, those that come with the `[full]` extra, see [here](https://github.com/laminlabs/lamindb/blob/2cc91adcf6077c5af69c1a098699085bb0844083/pyproject.toml#L30-L49).\n\nIf you want a maximally lightweight install of the `lamindb` namespace, use:\n\n```shell\npip install lamindb-core\n```\n\nThis suffices to support the basic functionality but you will get an `ImportError` if you're e.g. trying to validate a `DataFrame` because that requires `pandera`.\n\n</details>\n\n### Query databases & load artifacts\n\nYou can browse public databases at [lamin.ai/explore](https://lamin.ai/explore). To query [laminlabs/cellxgene](https://lamin.ai/laminlabs/cellxgene), run:\n\n```python\nimport lamindb as ln\n\ndb = ln.DB(\"laminlabs/cellxgene\")  # a database object for queries\ndf = db.Artifact.to_dataframe()    # a dataframe listing datasets & models\n```\n\nTo get a [specific dataset](https://lamin.ai/laminlabs/cellxgene/artifact/BnMwC3KZz0BuKftR), run:\n\n```python\nartifact = db.Artifact.get(\"BnMwC3KZz0BuKftR\")  # a metadata object for a dataset\nartifact.describe()                             # describe the context of the dataset\n```\n\n<details>\n<summary>See the output.</summary>\n<img src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/mxlUQiRLMU4Zos6k0001.png\" width=\"550\">\n</details>\n\nAccess the content of the dataset via:\n\n```python\nlocal_path = artifact.cache()  # return a local path from a cache\nadata = artifact.load()        # load object into memory\naccessor = artifact.open()     # return a streaming accessor\n```\n\nYou can query by biological entities like `Disease` through plug-in `bionty`:\n\n```python\nalzheimers = db.bionty.Disease.get(name=\"Alzheimer disease\")\ndf = 
db.Artifact.filter(diseases=alzheimers).to_dataframe()\n```\n\n### Configure your database\n\nYou can create a LaminDB instance at [lamin.ai](https://lamin.ai) and invite collaborators.\nTo connect to an existing instance, run:\n\n```shell\n# log into LaminHub\nlamin login\n# then either\nlamin connect account/name  # connect globally in your environment\n# or\nlamin connect --here account/name  # connect in your current development directory\n```\n\nIf you prefer to init a new instance instead (no login required), run:\n\n```shell\nlamin init --storage ./quickstart-data --modules bionty\n```\n\nFor more configuration, read: [docs.lamin.ai/setup](https://docs.lamin.ai/setup).\n\nOn the terminal and in a Python session, LaminDB will now auto-connect.\n\n### Save files & folders as artifacts\n\nTo save a file or folder via the API:\n\n```python\nimport lamindb as ln\n# → connected lamindb: account/instance\n\nopen(\"sample.fasta\", \"w\").write(\">seq1\\nACGT\\n\")        # create dataset\nln.Artifact(\"sample.fasta\", key=\"sample.fasta\").save()  # save dataset\n```\n\nTo save a file or folder via the CLI, run:\n\n```shell\nlamin save sample.fasta --key sample.fasta\n```\n\nTo load an artifact via the CLI into a local cache, run:\n\n```shell\nlamin load --key sample.fasta\n```\n\nRead more about the CLI: [docs.lamin.ai/cli](https://docs.lamin.ai/cli).\n\n### Lineage: scripts & notebooks\n\nTo create a dataset while tracking source code, inputs, outputs, logs, and environment:\n\n```python\nimport lamindb as ln\n# → connected lamindb: account/instance\n\nln.track()                                              # track code execution\nopen(\"sample.fasta\", \"w\").write(\">seq1\\nACGT\\n\")        # create dataset\nln.Artifact(\"sample.fasta\", key=\"sample.fasta\").save()  # save dataset\nln.finish()                                             # mark run as finished\n```\n\nRunning this snippet as a script (`python create-fasta.py`) produces the following data 
lineage:\n\n```python\nartifact = ln.Artifact.get(key=\"sample.fasta\")  # get artifact by key\nartifact.describe()      # context of the artifact\nartifact.view_lineage()  # fine-grained lineage\n```\n\n<img src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/BOTCBgHDAvwglN3U0004.png\" width=\"550\"> <img src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/EkQATsQL5wqC95Wj0006.png\" width=\"140\">\n\nWatch a mini video: [youtu.be/jwnHu1PbA9Q](https://youtu.be/jwnHu1PbA9Q)\n\n<details>\n<summary>Access run & transform.</summary>\n\n```python\nrun = artifact.run              # get the run object\ntransform = artifact.transform  # get the transform object\nrun.describe()                  # context of the run\n```\n\n<img src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/rJrHr3XaITVS4wVJ0000.png\" width=\"550\" />\n\n```python\ntransform.describe()  # context of the transform\n```\n\n<img src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/JYwmHBbgf2MRCfgL0000.png\" width=\"550\" />\n\n</details>\n\n<details>\n<summary>Track a project or an agent plan.</summary>\n\nPass a project/artifact to `ln.track()`, for example:\n\n```python\nln.track(project=\"My project\", plan=\"./plans/curate-dataset-x.md\")\n```\n\nNote that you have to create a project or save the agent plan in case they don't yet exist:\n\n```shell\n# create a project with the CLI\nlamin create project \"My project\"\n\n# save an agent plan with the CLI\nlamin save /path/to/.cursor/plans/curate-dataset-x.plan.md\nlamin save /path/to/.claude/plans/curate-dataset-x.md\n```\n\nOr in Python:\n\n```python\nln.Project(name=\"My project\").save()  # create a project in Python\n```\n\n</details>\n\n\n### Lineage: functions & workflows\n\nYou can achieve the same traceability for functions & workflows:\n\n<!-- #skip_laminr -->\n\n```python\nimport lamindb as ln\n\n@ln.flow()\ndef create_fasta(fasta_file: str = \"sample.fasta\"):\n    open(fasta_file, \"w\").write(\">seq1\\nACGT\\n\")    # 
create dataset\n    ln.Artifact(fasta_file, key=fasta_file).save()  # save dataset\n\nif __name__ == \"__main__\":\n    create_fasta()\n```\n\n<!-- #end_skip_laminr -->\n\nBeyond what you get for scripts & notebooks, this automatically tracks function & CLI params and integrates well with established Python workflow managers: [docs.lamin.ai/track](https://docs.lamin.ai/track). To integrate advanced bioinformatics pipeline managers like Nextflow, see [docs.lamin.ai/pipelines](https://docs.lamin.ai/pipelines).\n\n<details>\n<summary>A richer example.</summary>\n\nHere is an automatically generated re-construction of the project of [Schmidt _et al._ (Science, 2022)](https://pubmed.ncbi.nlm.nih.gov/35113687/):\n\n<img src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/KQmzmmLOeBN0C8Yk0004.png\" width=\"850\">\n\nA phenotypic CRISPRa screening result is integrated with scRNA-seq data. Here is the result of the screen input:\n\n<img src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/JvLaK9Icj11eswQn0000.png\" width=\"850\">\n\nYou can explore it [here](https://lamin.ai/laminlabs/lamindata/artifact/W1AiST5wLrbNEyVq) on LaminHub or [here](https://github.com/laminlabs/schmidt22) on GitHub.\n\n</details>\n\n### Labeling & queries by fields\n\nYou can label an artifact by running:\n\n```python\nmy_label = ln.ULabel(name=\"My label\").save()   # a universal label\nproject = ln.Project(name=\"My project\").save() # a project label\nartifact.ulabels.add(my_label)\nartifact.projects.add(project)\n```\n\nQuery for it:\n\n```python\nln.Artifact.filter(ulabels=my_label, projects=project).to_dataframe()\n```\n\nYou can also query by the metadata that lamindb automatically collects:\n\n```python\nln.Artifact.filter(run=run).to_dataframe()              # by creating run\nln.Artifact.filter(transform=transform).to_dataframe()  # by creating transform\nln.Artifact.filter(size__gt=1e6).to_dataframe()         # size greater than 1MB\n```\n\nIf you want to include more 
information into the resulting dataframe, pass `include`.\n\n```python\nln.Artifact.to_dataframe(include=[\"created_by__name\", \"storage__root\"])  # include fields from related registries\n```\n\nNote: The query syntax for `DB` objects and for your default database is the same.\n\n### The core data model\n\nHere is an overview that illustrates how `Artifact` links to all other registries:\n\n<img width=\"800px\" src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/HMfWLa1rFkxcxQEN0000.svg\">\n\n### Queries by features\n\nYou can annotate datasets and samples with features. Let's define some:\n\n```python\nfrom datetime import date\n\nln.Feature(name=\"gc_content\", dtype=float).save()\nln.Feature(name=\"experiment_note\", dtype=str).save()\nln.Feature(name=\"experiment_date\", dtype=date, coerce=True).save()  # accept date strings\n```\n\nDuring annotation, feature names and data types are validated against these definitions.\n\n```python\nartifact.features.set_values({\n    \"gc_content\": 0.55,\n    \"experiment_note\": \"Looks great\",\n    \"experiment_date\": \"2025-10-24\",\n})\n```\n\nQuery for it:\n\n```python\nln.Artifact.filter(experiment_date=\"2025-10-24\").to_dataframe()  # query all artifacts annotated with `experiment_date`\n```\n\nIf you want to include the feature values into the dataframe, pass `include`.\n\n```python\nln.Artifact.to_dataframe(include=\"features\")  # include the feature annotations\n```\n\n### Lake ♾️ LIMS ♾️ Sheets\n\nYou can create records for the entities underlying your experiments: samples, perturbations, instruments, etc., for example:\n\n```python\nln.Record(name=\"Sample 1\", features={\"gc_content\": 0.5}).save()\n```\n\nYou can create relationships of entities:\n\n```python\n# create a flexible record type to track experiments\nexperiment_type = ln.Record(name=\"Experiment\", is_type=True).save()\n\n# create a record of type `Experiment` for your first experiment\nln.Record(name=\"Experiment 1\", 
type=experiment_type).save()\n\n# create a feature to link experiments in records, dataframes, etc.\nln.Feature(name=\"experiment\", dtype=experiment_type).save()\n\n# create a sample record that links the sample to `Experiment 1` via the `experiment` feature\nln.Record(name=\"Sample 2\", features={\"gc_content\": 0.5, \"experiment\": \"Experiment 1\"}).save()\n```\n\nYou can convert any record type to dataframe/sheet:\n\n```python\nexperiment_type.to_dataframe()\n```\n\n<details>\n<summary>You can edit records like Excel sheets on LaminHub.</summary>\n<img width=\"800px\" src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/XSzhWUb0EoHOejiw0001.png\">\n</details>\n\n### Data versioning\n\nIf you change source code or datasets, LaminDB manages versioning for you.\nAssume you run a new version of our `create-fasta.py` script to create a new version of `sample.fasta`.\n\n```python\nimport lamindb as ln\n\nln.track()\nopen(\"sample.fasta\", \"w\").write(\">seq1\\nTGCA\\n\")  # a new sequence\nln.Artifact(\"sample.fasta\", key=\"sample.fasta\", features={\"experiment\": \"Experiment 1\"}).save()  # annotate with the new experiment\nln.finish()\n```\n\nIf you now query by `key`, you'll get the latest version of this artifact:\n\n```python\nartifact = ln.Artifact.get(key=\"sample.fasta\")  # get artifact by key\nartifact.versions.to_dataframe()                # see all versions of that artifact\n```\n\n### Change management\n\nTo create a contribution branch and switch to it, run:\n\n```shell\nlamin switch -c my_branch\n```\n\nTo merge a contribution branch into `main`, run:\n\n```shell\nlamin switch main  # switch to the main branch\nlamin merge my_branch  # merge contribution branch into main\n```\n\nRead more: [docs.lamin.ai/lamindb.branch](https://docs.lamin.ai/lamindb.branch).\n\n### Data sharing\n\nTo share data in a lineage-aware way, sync objects from a source database to your default database:\n\n```python\ndb = ln.DB(\"laminlabs/lamindata\")\nartifact = 
db.Artifact.get(key=\"example_datasets/mini_immuno/dataset1.h5ad\")\nartifact.save()\n```\n\nThis is zero-copy for the artifact's data in storage. Read more: [docs.lamin.ai/sync](https://docs.lamin.ai/sync).\n\n### Lakehouse ♾️ feature store\n\nHere is how you ingest a `DataFrame`:\n\n```python\nimport pandas as pd\n\ndf = pd.DataFrame({\n    \"sequence_str\": [\"ACGT\", \"TGCA\"],\n    \"gc_content\": [0.55, 0.54],\n    \"experiment_note\": [\"Looks great\", \"Ok\"],\n    \"experiment_date\": [date(2025, 10, 24), date(2025, 10, 25)],\n})\nln.Artifact.from_dataframe(df, key=\"my_datasets/sequences.parquet\").save()  # no validation\n```\n\nTo validate & annotate the content of the dataframe, use the built-in schema `valid_features`:\n\n```python\nln.Feature(name=\"sequence_str\", dtype=str).save()  # define a remaining feature\nartifact = ln.Artifact.from_dataframe(\n    df,\n    key=\"my_datasets/sequences.parquet\",\n    schema=\"valid_features\"  # validate columns against features\n).save()\nartifact.describe()\n```\n\nWatch a mini video: [youtu.be/Ji6E7hTnReQ](https://youtu.be/Ji6E7hTnReQ)\n\nYou can filter for datasets by schema and then launch distributed queries and batch loading.\n\n### Lakehouse beyond tables\n\nTo validate an `AnnData` with built-in schema `ensembl_gene_ids_and_valid_features_in_obs`, call:\n\n```python\nimport anndata as ad\nimport numpy as np\nimport pandas as pd\n\nadata = ad.AnnData(\n    X=np.ones((21, 10)),\n    obs=pd.DataFrame({'cell_type_by_model': ['T cell', 'B cell', 'NK cell'] * 7}),\n    var=pd.DataFrame(index=[f'ENSG{i:011d}' for i in range(10)])\n)\nartifact = ln.Artifact.from_anndata(\n    adata,\n    key=\"my_datasets/scrna.h5ad\",\n    schema=\"ensembl_gene_ids_and_valid_features_in_obs\"\n).save()\nartifact.describe()\n```\n\nTo validate a `SpatialData` or any other array-like dataset, you need to construct a `Schema`. 
You can do this by composing simple `pandera`-style schemas: [docs.lamin.ai/curate](https://docs.lamin.ai/curate).\n\n### Ontologies\n\nPlugin `bionty` gives you >20 public ontologies as `SQLRecord` registries. This was used to validate the `ENSG` ids in the `adata` just before.\n\n```python\nimport bionty as bt\n\nbt.CellType.import_source()  # import the default ontology\nbt.CellType.to_dataframe()   # your extensible cell type ontology in a simple registry\n```\n\nYou can then create objects, e.g. for labeling, analogous to `ULabel`, `Project`, or `Record`:\n\n```python\nt_cell = bt.CellType.get(name=\"T cell\")\nartifact.cell_types.add(t_cell)\n```\n\nRead more: [docs.lamin.ai/manage-ontologies](https://docs.lamin.ai/manage-ontologies).\n\nWatch a mini video: [youtu.be/3vpWjHj3Kw8](https://youtu.be/3vpWjHj3Kw8)\n\n### Save unstructured notes\n\nWhen in your development directory, you can save markdown files as records:\n\n```shell\nlamin save <topic>/<my-note.md>\n```\n"
  },
  {
    "path": "docs/api.md",
    "content": "# API Reference\n\n<meta http-equiv=\"Refresh\" content=\"0; url=./lamindb.html\" />\n\n```{toctree}\n:maxdepth: 1\n:caption: CLI & lamindb\n:hidden:\n\ncli\nlamindb\n```\n\n```{toctree}\n:maxdepth: 1\n:caption: Modules\n:hidden:\n\nbionty\npertdb\n```\n"
  },
  {
    "path": "docs/arrays.md",
    "content": "---\nexecute_via: python\n---\n\n# Stream datasets from storage\n\nThis guide walks through streaming datasets from disk or cloud storage.\n\n```python\n# replace with your username and S3 bucket\n!lamin login testuser1\n!lamin init --storage s3://lamindb-ci/test-arrays\n```\n\nImport lamindb and track this notebook.\n\n```python\nimport lamindb as ln\nimport numpy as np\n\nln.track()\ndb = ln.DB(\"laminlabs/lamindata\")  # we'll pull dataset from there\n```\n\n## DataFrame\n\n### Streaming from a single artifact\n\nA dataframe stored as sharded `parquet`.\n\n```python\nartifact = db.Artifact.get(key=\"sharded_parquet\")\n```\n\n```python\nartifact.path.view_tree()\n```\n\n```python\ndataset = artifact.open()\n```\n\nThis returns a [pyarrow dataset](https://arrow.apache.org/docs/python/dataset.html).\n\n```python\ndataset\n```\n\n```python\ndataset.head(5).to_pandas()\n```\n\n### Streaming from a set of artifacts\n\nYou can open several parquet files as a single dataset by calling `.open()` on the result of a query:\n\n```python\ndataset = db.Artifact.filter(\n    key__startswith=\"example_datasets/small\", suffix=\".parquet\", is_latest=True\n).open()  # open an ArtifactSet for streaming\ndataset\n```\n\nThe same is possible for the artifacts in a collection:\n\n```python\ncollection = db.Collection.get(key=\"sharded_parquet_collection\")\ndataset = collection.open()\ndataset\n```\n\nOnce you have a storage-backed dataset, you can query it like this:\n\n```python\ndataset.to_table().to_pandas()\n```\n\nBy default `Artifact.open()` and `Collection.open()` use `pyarrow` to lazily open dataframes. `polars` can be also used by passing `engine=\"polars\"`. 
Note also that `.open(engine=\"polars\")` returns a context manager with [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html).\n\n```python\nwith collection.open(engine=\"polars\", use_fsspec=True) as lazy_df:\n    display(lazy_df.collect().to_pandas())\n```\n\n## AnnData\n\nWe'll need some test data:\n\n```python\nln.Artifact(\"s3://lamindb-ci/test-arrays/pbmc68k.h5ad\").save()\nln.Artifact(\"s3://lamindb-ci/test-arrays/testfile.hdf5\").save()\n```\n\nAn `h5ad` artifact stored on s3:\n\n```python\nartifact = ln.Artifact.get(key=\"pbmc68k.h5ad\")\n```\n\n```python\nartifact.path\n```\n\n```python\nadata = artifact.open()\n```\n\nThis object is an `AnnDataAccessor` object, an `AnnData` object backed in the cloud:\n\n```python\nadata\n```\n\nWithout subsetting, the `AnnDataAccessor` object references underlying lazy `h5` or `zarr` arrays:\n\n```python\nadata.X\n```\n\nYou can subset it like a normal `AnnData` object:\n\n```python\nobs_idx = adata.obs.cell_type.isin([\"Dendritic cells\", \"CD14+ Monocytes\"]) & (\n    adata.obs.percent_mito <= 0.05\n)\nadata_subset = adata[obs_idx]\nadata_subset\n```\n\nSubsets load arrays into memory upon direct access:\n\n```python\nadata_subset.X\n```\n\nTo load the entire subset into memory as an actual `AnnData` object, use `to_memory()`:\n\n```python\nadata_subset.to_memory()\n```\n\nIt is also possible to add columns to `.obs` and `.var` of cloud AnnData objects without downloading them. 
First, create a new `AnnData` `zarr` artifact:\n\n```python\nadata_subset.to_memory().write_zarr(\"adata_subset.zarr\")\nartifact = ln.Artifact(\n    \"adata_subset.zarr\", description=\"test add column to adata\"\n).save()\n```\n\nThis is how you add a column:\n\n```python\nwith artifact.open(mode=\"r+\") as adata_accessor:\n    adata_accessor.add_column(where=\"obs\", col_name=\"ones\", col=np.ones(adata_accessor.shape[0]))\n    display(adata_accessor)\n```\n\nThe version of the artifact is updated after the modification.\n\n```python\nartifact\n```\n\n```python\nartifact.delete(permanent=True)\n```\n\n## SpatialData\n\nIt is also possible to access `AnnData` objects inside `SpatialData` `tables`:\n\n```python\nartifact = ln.Artifact.connect(\"laminlabs/lamindata\").get(\n    key=\"visium_aligned_guide_min.zarr\"\n)\n\naccess = artifact.open()\n```\n\n```python\naccess\n```\n\n```python\naccess.tables\n```\n\nThis gives you the same `AnnDataAccessor` object as for a normal `AnnData`.\n\n```python\ntable = access.tables[\"table\"]\n\ntable\n```\n\nYou can subset it and read into memory as an actual `AnnData`:\n\n```python\ntable_subset = table[table.obs[\"clone\"] == \"diploid\"]\n\ntable_subset\n```\n\n<!-- #region -->\n\n```python\nadata = table_subset.to_memory()\n```\n\n<!-- #endregion -->\n\n## Generic HDF5\n\nLet us query a generic HDF5 artifact:\n\n```python\nartifact = ln.Artifact.get(key=\"testfile.hdf5\")\n```\n\nAnd get a backed accessor:\n\n```python\nbacked = artifact.open()\n```\n\nThe returned object contains the `.connection` and `h5py.File` or `zarr.Group` in `.storage`:\n\n```python\nbacked\n```\n\n```python\nbacked.storage\n```\n\n```python\n# clean up test instance\nln.setup.delete(\"test-arrays\", force=True)\n```\n"
  },
  {
    "path": "docs/bionty.md",
    "content": "# `bionty`\n\n```{eval-rst}\n.. automodule:: bionty\n```\n"
  },
  {
    "path": "docs/changelog.md",
    "content": "# Changelog\n\nActual content in lamin-docs.\n"
  },
  {
    "path": "docs/curate.md",
    "content": "---\nexecute_via: python\n---\n\n# Validate & standardize datasets\n\nData curation with LaminDB ensures your datasets are **validated** and **queryable** through **annotation**.\n\n```{raw} html\n<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/Ji6E7hTnReQ?si=K0OnU2MTGv4fIhFo\" title=\"YouTube video player\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen></iframe>\n```\n\nCurating a dataset with LaminDB means three things:\n\n- **Validate** that the dataset matches a desired schema.\n- **Standardize** the dataset (e.g., by fixing typos, mapping synonyms) or update registries if validation fails.\n- **Annotate** the dataset by linking it against metadata entities so that it becomes queryable.\n\nIn this guide we'll curate common data structures.\nHere is a [guide](/faq/curate-any) for the underlying low-level API.\n\nNote: If you know either `pydantic` or `pandera`, here is an [FAQ](/faq/pydantic-pandera) that compares LaminDB with both of these tools.\n\n```python\n# pip install lamindb\n!lamin init --storage ./test-curate --modules bionty\n```\n\n```python\nimport lamindb as ln\n\nln.track()\n```\n\n<!-- #region -->\n\n## Schema design patterns\n\nA {class}`~lamindb.Schema` in LaminDB is a specification that defines the expected structure, data types, and validation rules for a dataset.\nIt is similar to `pydantic.Model` for dictionaries, and `pandera.Schema`, and `pyarrow.lib.Schema` for tables, but supporting more complicated data structures.\n\nSchemas ensure data consistency by defining:\n\n- What {class}`~lamindb.Feature`s (dimensions) exist in your dataset\n- What data types those features should have\n- What values are valid for categorical features\n- Which {class}`~lamindb.Feature`s are required vs optional\n\nAn exemplary schema:\n\n```python\nschema = ln.Schema(\n    
name=\"experiment_schema\",           # human-readable name\n    features=[                          # required features\n        ln.Feature(name=\"cell_type\", dtype=bt.CellType),\n        ln.Feature(name=\"treatment\", dtype=str),\n    ],\n    otype=\"DataFrame\"                   # object type (DataFrame, AnnData, etc.)\n)\n```\n\nFor composite data structures using slots:\n\n```{dropdown} What are slots?\n\nFor composite data structures, you need to specify which component contains which schema, for example, to validate both cell metadata in `.obs` and gene metadata in `.var` within the same schema.\nEach slot is a key like `\"obs\"` for AnnData observations,`\"rna:var\"` for MuData modalities, or `\"attrs:nested:key\"` for SpatialData annotations.\n```\n\n```python\n# AnnData with multiple \"slots\"\nadata_schema = ln.Schema(\n    otype=\"AnnData\",\n    slots={\n        \"obs\": cell_metadata_schema,     # cell annotations\n        \"var.T\": gene_id_schema          # gene-derived features\n    }\n)\n```\n\nBefore diving into curation, let's understand the different schema approaches and when to use each one.\nThink of schemas as rules that define what valid data should look like.\n\n<!-- #endregion -->\n\n### Flexible schema\n\nUse when: You want to validate those columns whose names match feature names in your `Feature` registry.\n\n```{eval-rst}\n.. literalinclude:: scripts/define_valid_features.py\n   :language: python\n```\n\n### Minimal required schema\n\nUse when: You need certain columns but want flexibility for additional metadata.\n\n```{eval-rst}\n.. 
literalinclude:: scripts/define_mini_immuno_schema_flexible.py\n   :language: python\n```\n\n<!-- #region -->\n\n### Strict Schema\n\nUse when: You need complete control over data structure and values.\n\n```python\n# Only allows specified columns\nschema = ln.Schema(\n    features=[...],\n    minimal_set=True,  # whether all passed features are required\n    maximal_set=False  # whether additional features are allowed\n)\n```\n\n<!-- #endregion -->\n\n## DataFrame\n\n### Step 1: Load and examine your data\n\nWe'll be working with the mini immuno dataset:\n\n```python\ndf = ln.examples.datasets.mini_immuno.get_dataset1(\n    with_cell_type_synonym=True, with_cell_type_typo=True\n)\ndf\n```\n\n### Step 2: Set up your metadata registries\n\nBefore creating a schema, ensure your registries have the right features and labels:\n\n```{eval-rst}\n.. literalinclude:: scripts/define_mini_immuno_features_labels.py\n   :language: python\n```\n\n### Step 3: Create your schema\n\n```python\nschema = ln.examples.datasets.mini_immuno.define_mini_immuno_schema_flexible()\nschema.describe()\n```\n\n<!-- #region -->\n\n### Step 4: Initialize Curator and first validation\n\nIf you expect the validation to pass, you can directly register an artifact by providing the schema:\n\n```python\n\nartifact = ln.Artifact.from_dataframe(df, key=\"examples/my_curated_dataset.parquet\", schema=schema).save()\n```\n\n<!-- #endregion -->\n\nThe {meth}`~lamindb.curators.core.Curator.validate` method validates that your dataset adheres to the criteria defined by the `schema`.\nIt identifies which values are already validated (exist in the registries) and which are potentially problematic (do not yet exist in our registries).\n\n```python\ntry:\n    curator = ln.curators.DataFrameCurator(df, schema)\n    curator.validate()\nexcept ln.errors.ValidationError as error:\n    print(error)\n```\n\n### Step 5: Fix validation issues\n\n```python\n# check the non-validated 
terms\ncurator.cat.non_validated\n```\n\nFor `cell_type_by_expert`, we saw 2 terms are not validated.\n\nFirst, let's standardize synonym \"B-cell\" as suggested\n\n```python\ncurator.cat.standardize(\"cell_type_by_expert\")\n```\n\n```python\n# now we have only one non-validated cell type left\ncurator.cat.non_validated\n```\n\nFor \"CD8-pos alpha-beta T cell\", let's understand which cell type in the public ontology might be the actual match.\n\n```python\n# to check the correct spelling of categories, pass `public=True` to get a lookup object from public ontologies\n# use `lookup = curator.cat.lookup()` to get a lookup object of existing records in your instance\nlookup = curator.cat.lookup(public=True)\nlookup\n```\n\n```python\n# here is an example for the \"cell_type\" column\ncell_types = lookup[\"cell_type_by_expert\"]\ncell_types.cd8_positive_alpha_beta_t_cell\n```\n\n```python\n# fix the cell type name\ndf[\"cell_type_by_expert\"] = df[\"cell_type_by_expert\"].cat.rename_categories(\n    {\"CD8-pos alpha-beta T cell\": cell_types.cd8_positive_alpha_beta_t_cell.name}\n)\n```\n\nFor perturbation, we want to add the new values: \"DMSO\", \"IFNG\"\n\n```python\n# this adds perturbations that were _not_ validated\ncurator.cat.add_new_from(\"perturbation\")\n```\n\n```python\nln.Feature.get(name=\"perturbation\")\n```\n\n```python\n# validate again\ncurator.validate()\n```\n\n### Step 6: Save your curated dataset\n\n```python\nartifact = curator.save_artifact(key=\"examples/my_curated_dataset.parquet\")\n```\n\n```python\nartifact.describe()\n```\n\n## Common fixes\n\nThis section covers the most frequent curation issues and their solutions.\nUse this as a reference when validation fails.\n\n### Feature validation issues\n\n<!-- #region -->\n\n**Issue**: \"Column not in dataframe\"\n\n```\n\"column 'treatment' not in dataframe. 
Columns in dataframe: ['drug', 'timepoint', ...]\"\n```\n\n**Solutions**:\n\n```python\n# Solution 1: Rename columns to match schema\ndf = df.rename(columns={\n    'treatment': 'drug',\n    'time': 'timepoint',\n    ...\n})\n\n# Solution 2: Create missing columns\ndf['treatment'] = 'unknown'  # Add with default value (or define Feature.default_value)\n\n# Solution 3: Modify schema to match your data\nschema = ln.Schema(\n    features=[\n        ln.Feature.get(name=\"drug\"),  # Use actual column name\n        ln.Feature.get(name=\"timepoint\"),\n    ],\n    ...\n)\n```\n\n<!-- #endregion -->\n\n### Value validation issues\n\n<!-- #region -->\n\n**Issue**: \"Terms not validated in feature 'cell_type'\"\n\n```\n2 terms not validated in feature 'cell_type': 'B-cell', 'CD8-pos alpha-beta T cell'\n    1 synonym found: \"B-cell\" → \"B cell\"\n    → curate synonyms via: .standardize(\"cell_type\")\n    for remaining terms:\n    → fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('cell_type')\n```\n\n**Solutions**:\n\n```python\n# Solution 1: Use automatic standardization if given hint (handles synonyms)\ncurator.cat.standardize('cell_type')\n\n# Solution 2: Manual mapping for complex cases\nvalue_mapping = {\n    'T-cells': 'T cell',\n    'B-cells': 'B cell',\n}\ndf['cell_type'] = df['cell_type'].map(value_mapping).fillna(df['cell_type'])\n\n# Solution 3: Use public ontology lookup for correct names\nlookup = curator.cat.lookup(public=True)\ncell_types = lookup[\"cell_type\"]\ndf['cell_type'] = df['cell_type'].cat.rename_categories({\n    'CD8-pos T cell': cell_types.cd8_positive_alpha_beta_t_cell.name\n})\n\n# Solution 4: Add new legitimate terms\ncurator.cat.add_new_from(\"cell_type\")\n```\n\n<!-- #endregion -->\n\n### Data type issues\n\n<!-- #region -->\n\n**Issue**: \"Expected categorical data, got object\"\n\n```\nTypeError: Expected categorical data for cell_type, got object\n```\n\n**Solutions**:\n\n```python\n# Solution 1: 
Convert to categorical\ndf['cell_type'] = df['cell_type'].astype('category')\n\n# Solution 2: Use coercion in feature definition\nln.Feature(name=\"cell_type\", dtype=bt.CellType, coerce=True).save()\n```\n\n<!-- #endregion -->\n\n### Organism-specific ontology issues\n\n<!-- #region -->\n\n**Issue**: \"Terms not validated\" for organism-specific ontologies like developmental stages\n\n```\n2 terms not validated in feature 'developmental_stage_ontology_id': 'MmusDv:0000142', 'MmusDv:0000022'\n```\n\n**Solution**: Specify organism-specific source in feature definition using `cat_filters`:\n\n```python\n# When defining the schema, specify the organism-specific source\nmouse_source = bt.Source.filter(\n    entity=\"bionty.DevelopmentalStage\",\n    organism=\"mouse\"\n).one()\n\nschema = ln.Schema(\n    features=[\n        ln.Feature(\n            name=\"developmental_stage_ontology_id\",\n            dtype=bt.DevelopmentalStage.ontology_id,\n            cat_filters={\"source\": mouse_source}  # Specify organism-specific source\n        )\n    ],\n    ...\n)\n```\n\nThis pattern applies to any ontology where the same registry serves multiple organisms (e.g., `DevelopmentalStage`, `Phenotype`, ...).\n\n<!-- #endregion -->\n\n## External data validation\n\nSince not all metadata is always stored within the dataset itself, it is also possible to validate external metadata.\n\n```{eval-rst}\n.. literalinclude:: scripts/curate_dataframe_external_features.py\n   :language: python\n   :caption: curate_dataframe_external_features.py\n```\n\n```python\n!python scripts/curate_dataframe_external_features.py\n```\n\n## Union dtypes\n\nSome metadata columns might validate against several registries.\n\n```{eval-rst}\n.. 
literalinclude:: scripts/curate_dataframe_union_features.py\n   :language: python\n   :caption: curate_dataframe_union_features.py\n```\n\n```python\n!python scripts/curate_dataframe_union_features.py\n```\n\n## AnnData\n\n`AnnData`, like all other data structures that follow, is a composite structure that stores different arrays in different `slots`.\n\n### Allow a flexible schema\n\nWe can also allow a flexible schema for an `AnnData` and only require that it's indexed with Ensembl gene IDs.\n\n```{eval-rst}\n.. literalinclude:: scripts/curate_anndata_flexible.py\n   :language: python\n   :caption: curate_anndata_flexible.py\n```\n\nLet's run the script.\n\n```python\n!python scripts/curate_anndata_flexible.py\n```\n\nUnder the hood, this uses the following built-in schema ({func}`~lamindb.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs`):\n\n```{eval-rst}\n.. literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py\n   :language: python\n```\n\nThis schema transposes the `var` DataFrame during curation, so that one validates and annotates the columns of `var.T`, i.e., `[ENSG00000153563, ENSG00000010610, ENSG00000170458]`.\nIf one doesn't transpose, one would annotate the columns of `var`, i.e., `[gene_symbol, gene_type]`.\n\n```{eval-rst}\n.. 
image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/gLyfToATM7WUzkWW0001.png\n    :width: 800px\n```\n\n### Fix validation issues\n\n```python\nadata = ln.examples.datasets.mini_immuno.get_dataset1(\n    with_gene_typo=True, with_cell_type_typo=True, otype=\"AnnData\"\n)\nadata\n```\n\n```python\nschema = ln.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs()\nschema.describe()\n```\n\nCheck the slots of a schema:\n\n```python\nschema.slots\n```\n\n```python\ncurator = ln.curators.AnnDataCurator(adata, schema)\ntry:\n    curator.validate()\nexcept ln.errors.ValidationError as error:\n    print(error)\n```\n\nAs above, we leverage a lookup object with valid cell types to find the correct name.\n\n```python\nvalid_cell_types = curator.slots[\"obs\"].cat.lookup()[\"cell_type_by_expert\"]\nadata.obs[\"cell_type_by_expert\"] = adata.obs[\n    \"cell_type_by_expert\"\n].cat.rename_categories(\n    {\"CD8-pos alpha-beta T cell\": valid_cell_types.cd8_positive_alpha_beta_t_cell.name}\n)\n```\n\nThe validated `AnnData` can be subsequently saved as an {class}`~lamindb.Artifact`:\n\n```python\nadata.obs.columns\n```\n\n```python\ncurator.slots[\"var.T\"].cat.add_new_from(\"columns\")\n```\n\n```python\ncurator.validate()\n```\n\n```python\nartifact = curator.save_artifact(key=\"examples/my_curated_anndata.h5ad\")\n```\n\nAccess the schema for each slot:\n\n```python\nartifact.features.slots\n```\n\nThe saved artifact has been annotated with validated features and labels:\n\n```python\nartifact.describe()\n```\n\n## Unstructured dictionaries\n\nMost data structures support unstructured metadata stored as dictionaries:\n\n- Pandas DataFrames: `.attrs`\n- AnnData: `.uns`\n- MuData: `.uns` and `modality:uns`\n- SpatialData: `.attrs`\n\nHere, we show by example how to curate such metadata for AnnData:\n\n```{eval-rst}\n.. 
literalinclude:: scripts/define_schema_anndata_uns.py\n   :language: python\n   :caption: define_schema_anndata_uns.py\n```\n\n```python\n!python scripts/define_schema_anndata_uns.py\n```\n\n```{eval-rst}\n.. literalinclude:: scripts/curate_anndata_uns.py\n   :language: python\n   :caption: curate_anndata_uns.py\n```\n\n```python\n!python scripts/curate_anndata_uns.py\n```\n\n## MuData\n\n```{eval-rst}\n.. literalinclude:: scripts/curate_mudata.py\n   :language: python\n   :caption: curate_mudata.py\n```\n\n```python\n!python scripts/curate_mudata.py\n```\n\n## SpatialData\n\n```{eval-rst}\n.. literalinclude:: scripts/define_schema_spatialdata.py\n   :language: python\n   :caption: define_schema_spatialdata.py\n```\n\n```python\n!python scripts/define_schema_spatialdata.py\n```\n\n```{eval-rst}\n.. literalinclude:: scripts/curate_spatialdata.py\n   :language: python\n   :caption: curate_spatialdata.py\n```\n\n```python\n!python scripts/curate_spatialdata.py\n```\n\n## TiledbsomaExperiment\n\n```{eval-rst}\n.. literalinclude:: scripts/curate_soma_experiment.py\n   :language: python\n   :caption: curate_soma_experiment.py\n```\n\n```python\n!python scripts/curate_soma_experiment.py\n```\n\n## Other data structures\n\nIf you have other data structures, read: {doc}`/faq/curate-any`.\n\n```python\n!rm -rf ./test-curate\n!rm -rf ./small_dataset.tiledbsoma\n!lamin delete --force test-curate\n```\n"
  },
  {
    "path": "docs/faq/acid.md",
    "content": "---\nexecute_via: python\n---\n\n# Will data & metadata stay in sync?\n\nHere, we walk through different errors that can occur while saving artifacts & metadata records, and show that the LaminDB instance does not get corrupted by dangling metadata or artifacts.\n\nTransactions within Python across data & metadata are [ACID](https://en.wikipedia.org/wiki/ACID).\n\nIf an upload process is externally killed and Python cannot run clean-up operations anymore, the artifact is internally still flagged with `artifact._storage_ongoing = True`. This is visible on the UI. You can then re-run `lamin save` or `artifact.save()` to attempt uploading the artifact a second time.\n\n```python\n!lamin init --storage ./test-acid\n```\n\n```python\nimport pytest\nimport lamindb as ln\nfrom upath import UPath\n\nln.settings.verbosity = \"debug\"\n```\n\n```python\nopen(\"sample.fasta\", \"w\").write(\">seq1\\nACGT\\n\")\n```\n\n## Save error due to failed upload within Python\n\nLet's try to save an artifact to a storage location without permission.\n\n```python\nartifact = ln.Artifact(\"sample.fasta\", key=\"sample.fasta\")\n```\n\nBecause the public API only allows you to set a default storage for which you have permission, we need to hack it:\n\n```python\nln.settings.storage._root = UPath(\"s3://nf-core-awsmegatests\")\n```\n\nThis raises an exception but nothing gets saved:\n\n```python\nwith pytest.raises(PermissionError) as error:\n    artifact.save()\nprint(error.exconly())\nassert len(ln.Artifact.filter()) == 0\n```\n\n## Save error during bulk creation\n\n```python\nartifacts = [artifact, \"this is not a record\"]\n```\n\nThis raises an exception but nothing gets saved:\n\n```python\nwith pytest.raises(Exception) as error:\n    ln.save(artifacts)\nprint(error.exconly())\nassert len(ln.Artifact.filter()) == 0  # nothing got saved\n```\n\nIf a list of data objects is passed to `ln.save()` and the upload of one of these data objects fails, the successful uploads 
are maintained and a `RuntimeError` is raised, listing the successfully uploaded data objects up until that point.\n\n## Save error due to externally aborted upload\n\nBack to a proper storage location:\n\n```python\nln.settings.storage._root = UPath(\"./test-acid\").absolute()\n```\n\nThe save operation works:\n\n```python\nartifact.save()\n```\n\nLet's pretend the upload was killed.\n\n```python\nartifact._storage_ongoing = True\nartifact.save()\nartifact.path.unlink()\nassert artifact._aux == {\"so\": 1}  # storage/upload is ongoing\n```\n\nWe can re-run it:\n\n```python\nartifact = ln.Artifact(\"sample.fasta\", key=\"sample.fasta\").save()\n```\n\n```python\nassert not artifact._storage_ongoing\nassert artifact._aux is None\n```\n\n```python\n!rm -r ./test-acid\n!lamin delete --force test-acid\n```\n"
  },
  {
    "path": "docs/faq/curate-any.md",
    "content": "---\nexecute_via: python\n---\n\n# How do I validate & annotate arbitrary data structures?\n\nThis guide walks through the low-level API that lets you validate iterables.\n\nYou can then use the records created or inferred during validation to annotate a dataset.\n\n:::{dropdown} How do I validate based on a public ontology?\n\nLaminDB makes it easy to validate categorical variables based on registries that inherit from {class}`~lamindb.models.CanCurate`.\n\n{class}`~lamindb.models.CanCurate` methods validate against the registries in your LaminDB instance.\nIn {doc}`/manage-ontologies`, you'll see how to extend standard validation to validation against _public references_ using a `PublicOntology` object, e.g., via `public_genes = bt.Gene.public()`.\nBy default, {meth}`~lamindb.models.CanCurate.from_values` considers a match in a public reference a validated value for any {mod}`bionty` entity.\n\n:::\n\n```python\n# pip install 'lamindb[zarr]'\n!lamin init --storage ./test-curate-any --modules bionty\n```\n\nDefine a test dataset.\n\n```python\nimport lamindb as ln\nimport bionty as bt\nimport zarr\nimport numpy as np\n\ndata = zarr.open_group(store=\"data.zarr\", mode=\"a\")\n\ndata.create_dataset(name=\"temperature\", shape=(3,), dtype=\"float32\")\ndata.create_dataset(name=\"knockout_gene\", shape=(3,), dtype=str)\ndata.create_dataset(name=\"disease\", shape=(3,), dtype=str)\n\ndata[\"knockout_gene\"][:] = np.array(\n    [\"ENSG00000139618\", \"ENSG00000141510\", \"ENSG00000133703\"]\n)\ndata[\"disease\"][:] = np.random.default_rng().choice(\n    [\"MONDO:0004975\", \"MONDO:0004980\"], 3\n)\n```\n\n## Validate and standardize vectors\n\nRead the `disease` array from the zarr group into memory.\n\n```python\ndisease = data[\"disease\"][:]\n```\n\n{meth}`~lamindb.models.CanCurate.validate` validates vector-like values against reference values in a registry.\nIt returns a boolean vector indicating where a value has an exact match in the reference 
values.\n\n```python\nbt.Disease.validate(disease, field=bt.Disease.ontology_id)\n```\n\nWhen validation fails, you can call {meth}`~lamindb.models.CanCurate.inspect` to figure out what to do.\n\n{meth}`~lamindb.models.CanCurate.inspect` applies the same definition of validation as {meth}`~lamindb.models.CanCurate.validate`, but returns a rich return value {class}`~lamindb.models.InspectResult`. Most importantly, it logs recommended curation steps that would render the data validated.\n\nNote: you can use {meth}`~lamindb.models.CanCurate.standardize` to standardize synonyms.\n\n```python\nbt.Disease.inspect(disease, field=bt.Disease.ontology_id)\n```\n\nBulk creating records using {meth}`~lamindb.models.CanCurate.from_values` only returns validated records.\n\n```python\ndiseases = bt.Disease.from_values(disease, field=bt.Disease.ontology_id).save()\n```\n\nRepeat the process for more labels:\n\n```python\nexperiments = ln.Record.from_values(\n    [\"Experiment A\", \"Experiment B\"],\n    field=ln.Record.name,\n    create=True,  # create non-validated labels\n).save()\ngenes = bt.Gene.from_values(\n    data[\"knockout_gene\"][:], field=bt.Gene.ensembl_gene_id\n).save()\n```\n\n## Annotate the dataset\n\nRegister the dataset as an artifact:\n\n```python\nartifact = ln.Artifact(\"data.zarr\", key=\"my_dataset.zarr\").save()\n```\n\nAnnotate with features:\n\n```python\nln.Feature(name=\"experiment\", dtype=ln.Record).save()\nln.Feature(name=\"disease\", dtype=bt.Disease.ontology_id).save()\nln.Feature(name=\"knockout_gene\", dtype=bt.Gene.ensembl_gene_id).save()\nartifact.features.set_values(\n    {\"experiment\": experiments, \"knockout_gene\": genes, \"disease\": diseases}\n)\nartifact.describe()\n```\n\n```python\n# clean up test instance\n!rm -r data.zarr\n!rm -r ./test-curate-any\n!lamin delete --force test-curate-any\n```\n"
  },
  {
    "path": "docs/faq/idempotency.md",
    "content": "---\nexecute_via: python\n---\n\n# Will data get duplicated upon re-running code?\n\nLaminDB's operations are idempotent in the sense defined here, which allows you to re-run code without duplicating data.\n\n:::{admonition} SQLRecords with `name` field\n\nWhen you instantiate {class}`~lamindb.models.SQLRecord` with a name, in case a name has an _exact match_ in a registry, the constructor returns it instead of creating a new record. In case records with _similar names_ exist, you'll see them in a table: you can then decide whether you want to save the new record or pick an existing record.\n\nIf you set {attr}`~lamindb.core.subsettings.CreationSettings.search_names` to `False`, you bypass these checks.\n\n:::\n\n:::{admonition} Artifacts & collections\n\nIf you instantiate {class}`~lamindb.Artifact` from data that already exists as an artifact, the `Artifact()` constructor returns the existing artifact based on a hash lookup.\n\n:::\n\n```python\n# pip install lamindb\n!lamin init --storage ./test-idempotency\n```\n\n```python\nimport lamindb as ln\n\nln.track(\"ANW20Fr4eZgM0000\")\n```\n\n## SQLRecords with name field\n\n```python\nassert ln.settings.creation.search_names\n```\n\nLet us add a first record to the {class}`~lamindb.Record` registry:\n\n```python\nlabel = ln.Record(name=\"My label 1\").save()\n```\n\nIf we create a new record, we'll automatically get search results that give clues on whether we are prone to duplicating an entry:\n\n```python\nlabel = ln.Record(name=\"My label 1a\")\n```\n\nLet's save the `1a` label, we actually intend to create it.\n\n```python\nlabel.save()\n```\n\nIn case we match an existing name directly, we'll get the existing object:\n\n```python\nlabel = ln.Record(name=\"My label 1\")\n```\n\nIf we save it again, it will not create a new entry in the registry:\n\n```python\nlabel.save()\n```\n\nNow, if we create a third record, we'll get two alternatives:\n\n```python\nlabel = ln.Record(name=\"My label 
1b\")\n```\n\nIf we prefer to not perform a search, e.g. for performance reasons, we can switch it off.\n\n```python\nln.settings.creation.search_names = False\nlabel = ln.Record(name=\"My label 1c\")\n```\n\nSwitch it back on:\n\n```python\nln.settings.creation.search_names = True\n```\n\n## Artifacts & collections\n\n```python\nfilepath = ln.examples.datasets.file_fcs()\n```\n\nCreate an `Artifact`:\n\n```python\nartifact = ln.Artifact(filepath, key=\"my_fcs_file.fcs\").save()\n```\n\n```python\nassert artifact.hash == \"rCPvmZB19xs4zHZ7p_-Wrg\"\nassert artifact.run == ln.context.run\nassert not artifact.recreating_runs.exists()\n```\n\nCreate an `Artifact` from the same path:\n\n```python\nartifact2 = ln.Artifact(filepath, key=\"my_fcs_file.fcs\")\n```\n\nIt gives us the existing object:\n\n```python\nassert artifact.id == artifact2.id\nassert artifact.run == artifact2.run\nassert not artifact.recreating_runs.exists()\n```\n\nIf you save it again, nothing will happen (the operation is idempotent):\n\n```python\nartifact2.save()\n```\n\nIn the hidden cell below, you'll see how this interplays with data lineage.\n\n```python\nln.track(new_run=True)\nartifact3 = ln.Artifact(filepath, key=\"my_fcs_file.fcs\")\nassert artifact3.id == artifact2.id\nassert artifact3.run == artifact2.run != ln.context.run  # run is not updated\nassert artifact2.recreating_runs.first() == ln.context.run\n```\n\n```python\n!rm -rf ./test-idempotency\n!lamin delete --force test-idempotency\n```\n"
  },
  {
    "path": "docs/faq/import-modules.md",
    "content": "---\nexecute_via: python\n---\n\n# What happens if I import a schema module without lamindb?\n\n```python\n# !pip install 'lamindb[bionty]'\n!lamin init --storage testmodule --modules bionty\n```\n\nUpon `import`, nothing yet happens:\n\n```python\nimport bionty as bt\n```\n\nIf you try to access an attribute (other than `model`), you'll load the instance in the same way as calling `import lamindb`.\n\nUnder the hood, `lamindb` is imported!\n\n```python\nassert bt.Organism(name=\"human\") is not None\n```\n\n```python\n!lamin delete --force testmodule\n```\n"
  },
  {
    "path": "docs/faq/keep-artifacts-local.md",
    "content": "---\nexecute_via: python\n---\n\n# Keep artifacts local in a cloud instance\n\nIf you want to default to keeping artifacts local in a cloud instance, enable {attr}`~lamindb.setup.core.InstanceSettings.keep_artifacts_local`.\n\nLet us first create a cloud instance that would store artifacts exclusively on S3.\n\n```python\n!lamin login testuser1\n!lamin init --storage s3://lamindb-ci/keep-artifacts-local\n```\n\nLet's import lamindb and track the current notebook run.\n\n```python\n# pip install lamindb\nimport lamindb as ln\n\nln.track(\"l9lFf83aPwRc\")\n```\n\n## Toggling setting \"keep artifacts local\"\n\nYou can checkmark the \"Keep artifacts local\" box on the instance settings tab.\n\n<img src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/6Kt20kV5sQIFyV0Q0000.png\" width=\"400px\">\n\nOr toggle it through the following instance setting.\n\n```python\nln.setup.settings.instance.keep_artifacts_local = True\n```\n\n## Create a local storage location\n\nCall the following for a -- potentially pre-existing -- root path and a unique host identifier.\n\n```python\nln.Storage(root=\"./our_local_storage\", host=\"abc-institute-drive1\").save()\n```\n\nNow, you have two storage locations: one in the S3 bucket, and the other locally.\n\n```python\nln.Storage.to_dataframe()\n```\n\nYou can now set it as a local default storage location.\nNext time you connect to the instance, this won't be necessary and the location will be automatically detected as the local default.\n\n```python\nln.settings.local_storage = \"./our_local_storage\"\n```\n\n## Use a local storage location\n\nIf you save an artifact in keep-artifacts-local mode, by default, it's stored in local storage.\n\n```python\noriginal_filepath = ln.examples.datasets.file_fcs()\nartifact = ln.Artifact(original_filepath, key=\"example_datasets/file1.fcs\").save()\nlocal_path = artifact.path  # local storage path\nlocal_path\n```\n\nYou'll see the `.fcs` file named by the `uid` in your 
`.lamindb/` directory under `./our_local_storage/`:\n\n```python\nassert artifact.path.exists()\nassert artifact.path.as_posix().startswith(ln.settings.local_storage.root.as_posix())\nln.settings.local_storage.root.view_tree()\n```\n\n## Pre-existing artifacts\n\nAssume you already have a file in your local storage location:\n\n```python\nfile_in_local_storage = ln.examples.datasets.file_bam()\nfile_in_local_storage.rename(\"./our_local_storage/output.bam\")\nln.UPath(\"our_local_storage/\").view_tree()\n```\n\nWhen registering an artifact for it, it remains where it is.\n\n```python\nmy_existing_file = ln.Artifact(\"./our_local_storage/output.bam\").save()\nln.UPath(\"our_local_storage/\").view_tree()\n```\n\nThe storage path of the artifact matches the pre-existing file:\n\n```python\nmy_existing_file.path\n```\n\n## Switching between local storage locations\n\nYou might have several local storage locations. Here is how you can switch between them.\n\n```python\nln.Storage(root=\"./our_local_storage2\", host=\"abc-institute-drive1\").save()\nln.settings.local_storage = \"./our_local_storage2\"  # switch to the new storage location\n```\n\nIngest a file into the new local storage location.\n\n```python\nfilepath = ln.examples.datasets.file_fastq()\nartifact3 = ln.Artifact(filepath, key=\"example_datasets/file.fastq.gz\").save()\n```\n\nInspect where all the files are.\n\n```python\nln.Artifact.to_dataframe(include=[\"storage__root\", \"storage__region\"])\n```\n\n## Upload a local artifact to the cloud\n\nIf you'd like to upload an artifact to the cloud storage location to more easily share it or view it through web applications, you pass `upload=True` to the `save()` method.\n\n```python\nartifact.save(upload=True)\n```\n\nYou now see the artifact in the S3 bucket:\n\n```python\nln.settings.storage.root.view_tree()\n```\n\nAnd it's no longer present in local storage:\n\n```python\nassert artifact.path.exists()\nassert not local_path.exists()\nassert 
artifact.path.as_posix().startswith(ln.settings.storage.root.as_posix())\nln.settings.local_storage.root.view_tree()\n```\n\n## Upload directly to the cloud\n\nYou can also directly upload via `upload=True`:\n\n```python\nfilepath = ln.examples.datasets.file_mini_csv()\nartifact2 = ln.Artifact(filepath, key=\"example_datasets/mini.csv\").save(upload=True)\nartifact2.path\n```\n\nNow we have two files on S3:\n\n```python\nln.Artifact.to_dataframe(include=\"storage__root\")\n```\n\n## Update storage description\n\nYou can add a description to the storage location by using the `description` field.\n\n```python\nstorage_record = ln.Storage.get(root__endswith=\"our_local_storage\")\nstorage_record.description = \"Our shared directory for project X\"\nstorage_record.save()\nln.Storage.to_dataframe()\n```\n\n## Delete the test instance\n\nDelete the artifacts:\n\n```python\nartifact.delete(permanent=True)\nartifact2.delete(permanent=True)\nartifact3.delete(permanent=True)\nmy_existing_file.delete(permanent=True, storage=False)\n```\n\nDelete the instance:\n\n```python\nln.setup.delete(\"keep-artifacts-local\", force=True)\n```\n"
  },
  {
    "path": "docs/faq/pydantic-pandera.md",
    "content": "---\nexecute_via: python\n---\n\n# Pydantic & Pandera vs. LaminDB\n\nThis doc explains conceptual differences between data validation with `pydantic`, `pandera`, and `LaminDB`.\n\n```python\n!lamin init --storage test-pydantic-pandera --modules bionty\n```\n\nLet us work with a test dataframe.\n\n```python\nimport pandas as pd\nimport pydantic\nimport lamindb as ln\nimport bionty as bt\nimport pandera.pandas as pandera\nimport pprint\n\nfrom typing import Literal, Any\n\ndf = ln.examples.datasets.mini_immuno.get_dataset1()\ndf\n```\n\n## Define a schema\n\n### pydantic\n\n```python\nPerturbation = Literal[\"DMSO\", \"IFNG\"]\nCellType = Literal[\"T cell\", \"B cell\"]\nOntologyID = Literal[\"EFO:0008913\"]\n\n\nclass ImmunoSchema(pydantic.BaseModel):\n    perturbation: Perturbation\n    cell_type_by_model: CellType\n    cell_type_by_expert: CellType\n    assay_oid: OntologyID\n    concentration: str\n    treatment_time_h: int\n    donor: str | None\n\n    class Config:\n        title = \"My immuno schema\"\n```\n\n### pandera\n\n```python\npandera_schema = pandera.DataFrameSchema(\n    {\n        \"perturbation\": pandera.Column(\n            str, checks=pandera.Check.isin([\"DMSO\", \"IFNG\"])\n        ),\n        \"cell_type_by_model\": pandera.Column(\n            str, checks=pandera.Check.isin([\"T cell\", \"B cell\"])\n        ),\n        \"cell_type_by_expert\": pandera.Column(\n            str, checks=pandera.Check.isin([\"T cell\", \"B cell\"])\n        ),\n        \"assay_oid\": pandera.Column(str, checks=pandera.Check.isin([\"EFO:0008913\"])),\n        \"concentration\": pandera.Column(str),\n        \"treatment_time_h\": pandera.Column(int),\n        \"donor\": pandera.Column(str, nullable=True),\n    },\n    name=\"My immuno schema\",\n)\n```\n\n### LaminDB\n\nFeatures & labels are defined on the level of the database instance.\nYou can either define a schema with required (and optional) 
columns.\n\n```python\nln.Record(name=\"DMSO\").save()\nln.Record(name=\"IFNG\").save()\n\n# leverage ontologies through types ln.Record, bt.CellType, bt.ExperimentalFactor\nlamindb_schema = ln.Schema(\n    name=\"My immuno schema\",\n    features=[\n        ln.Feature(name=\"perturbation\", dtype=ln.Record).save(),\n        ln.Feature(name=\"cell_type_by_model\", dtype=bt.CellType).save(),\n        ln.Feature(name=\"cell_type_by_expert\", dtype=bt.CellType).save(),\n        ln.Feature(name=\"assay_oid\", dtype=bt.ExperimentalFactor.ontology_id).save(),\n        ln.Feature(name=\"concentration\", dtype=str).save(),\n        ln.Feature(name=\"treatment_time_h\", dtype=int).save(),\n        ln.Feature(name=\"donor\", dtype=str, nullable=True).save(),\n    ],\n).save()\n```\n\nOr merely define a constraint on the feature identifier.\n\n```python\nlamindb_schema_only_itype = ln.Schema(\n    name=\"Allow any valid features & labels\", itype=ln.Feature\n)\n```\n\n## Validate a dataframe\n\n### pydantic\n\n```python\nclass DataFrameValidationError(Exception):\n    pass\n\n\ndef validate_dataframe(df: pd.DataFrame, model: type[pydantic.BaseModel]):\n    errors = []\n\n    for i, row in enumerate(df.to_dict(orient=\"records\")):\n        try:\n            model(**row)\n        except pydantic.ValidationError as e:\n            errors.append(f\"row {i} failed validation: {e}\")\n\n    if errors:\n        error_message = \"\\n\".join(errors)\n        raise DataFrameValidationError(\n            f\"DataFrame validation failed with the following errors:\\n{error_message}\"\n        )\n```\n\n```python\ntry:\n    validate_dataframe(df, ImmunoSchema)\nexcept DataFrameValidationError as e:\n    print(e)\n```\n\nTo fix the validation error, we need to update the `Literal` and re-run the model definition.\n\n```python\nPerturbation = Literal[\"DMSO\", \"IFNG\"]\nCellType = Literal[\n    \"T cell\", \"B cell\", \"CD8-positive, alpha-beta T cell\"  # <-- updated\n]\nOntologyID = 
Literal[\"EFO:0008913\"]\n\n\nclass ImmunoSchema(pydantic.BaseModel):\n    perturbation: Perturbation\n    cell_type_by_model: CellType\n    cell_type_by_expert: CellType\n    assay_oid: OntologyID\n    concentration: str\n    treatment_time_h: int\n    donor: str | None\n\n    class Config:\n        title = \"My immuno schema\"\n```\n\n```python\nvalidate_dataframe(df, ImmunoSchema)\n```\n\n### pandera\n\n```python\ntry:\n    pandera_schema.validate(df)\nexcept pandera.errors.SchemaError as e:\n    print(e)\n```\n\n### LaminDB\n\nBecause the term `\"CD8-positive, alpha-beta T cell\"` is part of the public `CellType` ontology, validation passes the first time.\n\nIf validation had not passed, we could have resolved the issue simply by adding a new term to the `CellType` registry rather than editing the code.\nThis also puts downstream data scientists into a position to update ontologies.\n\n```python\ncurator = ln.curators.DataFrameCurator(df, lamindb_schema)\ncurator.validate()\n```\n\nWhat was the cell type validation based on? 
Let's inspect the `CellType` registry.\n\n```python\nbt.CellType.to_dataframe()\n```\n\nThe `CellType` registry is hierarchical as it contains the Cell Ontology.\n\n```python\nbt.CellType.get(name=\"CD8-positive, alpha-beta T cell\").view_parents()\n```\n\n## Overview of validation properties\n\nImportantly, LaminDB offers not only a `DataFrameCurator`, but also an `AnnDataCurator`, `MuDataCurator`, `SpatialDataCurator`, and `TiledbsomaCurator`.\n\nThe below overview only concerns validating dataframes.\n\n### Experience of data engineer\n\n| property                                                                                                                       | `pydantic`                                            | `pandera`                                             | `lamindb`                                                                             |\n| ------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------- | ----------------------------------------------------- | ------------------------------------------------------------------------------------- |\n| define schema as code                                                                                                          | yes, in form of a `pydantic.BaseModel`                | yes, in form of a `pandera.DataFrameSchema`           | yes, in form of a `lamindb.Schema`                                                    |\n| define schema as a set of constraints without the need of listing fields/columns/features; e.g. 
useful if validating 60k genes | no                                                    | no                                                    | yes                                                                                   |\n| update labels independent of code                                                                                              | not possible because labels are enums/literals        | not possible because labels are hard-coded in `Check` | possible by adding new terms to a registry                                            |\n| built-in validation from public ontologies                                                                                     | no                                                    | no                                                    | yes                                                                                   |\n| sync labels with ELN/LIMS registries without code change                                                                       | no                                                    | no                                                    | yes                                                                                   |\n| can re-use fields/columns/features across schemas                                                                              | limited via subclass                                  | only in same Python session                           | yes because persisted in database                                                     |\n| schema modifications can invalidate previously validated datasets                                                              | yes                                                   | yes                                                   | no because LaminDB allows to query datasets that were validated with a schema version |\n| can use columnar organization of dataframe                                                              
                       | no, need to iterate over potentially millions of rows | yes                                                   | yes                                                                                   |\n\n### Experience of data consumer\n\n| property                                    | `pydantic`                                                                    | `pandera`             | `lamindb`                              |\n| ------------------------------------------- | ----------------------------------------------------------------------------- | --------------------- | -------------------------------------- |\n| dataset is queryable / findable             | no                                                                            | no                    | yes, by querying for labels & features |\n| dataset is annotated                        | no                                                                            | no                    | yes                                    |\n| user knows what validation constraints were | no, because might not have access to code and doesn't know which code was run | no (same as pydantic) | yes, via `artifact.schema`             |\n\n## Annotation & queryability\n\n### Engineer: annotate the dataset\n\nEither use the `Curator` object:\n\n```python\nartifact = curator.save_artifact(key=\"our_datasets/dataset1.parquet\")\n```\n\nIf you don't expect a need for Curator functionality for updating ontologies and standardization, you can also use the `Artifact` constructor.\n\n```python\nartifact = ln.Artifact.from_dataframe(\n    df, key=\"our_datasets/dataset1.parquet\", schema=lamindb_schema\n).save()\n```\n\n### Consumer: see annotations\n\n```python\nartifact.describe()\n```\n\n### Consumer: query the dataset\n\n```python\nln.Artifact.filter(perturbation=\"IFNG\").to_dataframe()\n```\n\n### Consumer: understand validation\n\nBy accessing `artifact.schema`, the consumer can understand 
_how_ the dataset was validated.\n\n```python\nartifact.schema\n```\n\n```python\nartifact.schema.features.to_dataframe()\n```\n\n## Nested data with dynamic keys\n\nWe will now examine another more complex example where data is nested with potentially arbitrary (dynamic) keys.\nThe example is inspired by the [CELLxGENE schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/6.0.0/schema.md#uns-dataset-metadata) where annotations are stored as dictionaries in the AnnData `.uns` slot.\n\n```python\nuns_dict = ln.examples.datasets.dict_cellxgene_uns()\npprint.pprint(uns_dict)\n```\n\n### pydantic\n\nPydantic is primed to deal with nested data.\n\n```python\nclass Images(pydantic.BaseModel):\n    fullres: str\n    hires: str\n\n\nclass Scalefactors(pydantic.BaseModel):\n    spot_diameter_fullres: float\n    tissue_hires_scalef: float\n\n\nclass Library(pydantic.BaseModel):\n    images: Images\n    scalefactors: Scalefactors\n\n\nclass Spatial(pydantic.BaseModel):\n    is_single: bool\n    model_config = {\"extra\": \"allow\"}\n\n    def __init__(self, **data):\n        libraries = {}\n        other_fields = {}\n\n        # store all libraries under a single key for validation\n        for key, value in data.items():\n            if key.startswith(\"library_\"):\n                libraries[key] = Library(**value)\n            else:\n                other_fields[key] = value\n\n        other_fields[\"libraries\"] = libraries\n        super().__init__(**other_fields)\n\n\nclass SpatialDataSchema(pydantic.BaseModel):\n    organism_ontology_term_id: str\n    spatial: Spatial\n\n\nvalidated_data = SpatialDataSchema(**uns_dict)\n```\n\nHowever, pydantic either requires all dictionary keys to be known beforehand to construct the Model classes or workarounds to collect all keys for a single model.\n\n### pandera\n\nPandera cannot validate dictionaries because it is designed for structured dataframe data.\nTherefore, we need to flatten the dictionary to 
transform it into a DataFrame:\n\n```python\ndef _flatten_dict(d: dict[Any, Any], parent_key: str = \"\", sep: str = \"_\"):\n    items = []\n    for k, v in d.items():\n        new_key = f\"{parent_key}{sep}{k}\" if parent_key else k\n        if isinstance(v, dict):\n            items.extend(_flatten_dict(v, new_key, sep=sep).items())\n        else:\n            items.append((new_key, v))\n    return dict(items)\n```\n\n```python\ndef create_dynamic_schema(flattened_data: dict[str, Any]):\n    schema_dict = {\n        \"organism_ontology_term_id\": pandera.Column(str),\n        \"spatial_is_single\": pandera.Column(bool),\n    }\n\n    for key in flattened_data.keys():\n        if key.startswith(\"spatial_library_\") and key.endswith(\"_images_fullres\"):\n            lib_prefix = key.replace(\"_images_fullres\", \"\")\n            schema_dict.update(\n                {\n                    f\"{lib_prefix}_images_fullres\": pandera.Column(str),\n                    f\"{lib_prefix}_images_hires\": pandera.Column(str),\n                    f\"{lib_prefix}_scalefactors_spot_diameter_fullres\": pandera.Column(\n                        float\n                    ),\n                    f\"{lib_prefix}_scalefactors_tissue_hires_scalef\": pandera.Column(\n                        float\n                    ),\n                }\n            )\n\n    return pandera.DataFrameSchema(schema_dict)\n\n\nflattened = _flatten_dict(uns_dict)\ndf = pd.DataFrame([flattened])\nspatial_schema = create_dynamic_schema(flattened)\nvalidated_df = spatial_schema.validate(df)\n```\n\nAnalogously to pydantic, pandera does not have out-of-the-box support for dynamically named keys.\nTherefore, it is necessary to dynamically construct a pandera schema.\n\n### LaminDB\n\nSimilarly, LaminDB currently requires constructing flattened dataframes to dynamically create features for the schema, which can then be used for validation with the DataFrameCurator.\nFuture improvements are expected, 
including support for a dictionary-specific curator.\n\n```python\ndef create_dynamic_schema(flattened_data: dict[str, Any]) -> ln.Schema:\n    features = []\n\n    for key, value in flattened_data.items():\n        if key == \"organism_ontology_term_id\":\n            features.append(ln.Feature(name=key, dtype=bt.Organism.ontology_id).save())\n        elif isinstance(value, bool):\n            features.append(ln.Feature(name=key, dtype=bool).save())\n        elif isinstance(value, (int, float)):\n            features.append(ln.Feature(name=key, dtype=float).save())\n        else:\n            features.append(ln.Feature(name=key, dtype=str).save())\n\n    return ln.Schema(name=\"Spatial data schema\", features=features, coerce=True).save()\n\n\nflattened = _flatten_dict(uns_dict)\nflattened_df = pd.DataFrame([flattened])\nspatial_schema = create_dynamic_schema(flattened)\ncurator = ln.curators.DataFrameCurator(flattened_df, spatial_schema)\ncurator.validate()\n```\n\n```{note}\nCurators for scverse data structures allow for the specification of schema slots that access and validate dataframes in nested dictionary attributes like `.attrs` or `.uns`.\nThese schema slots use colon-separated paths like `'attrs:sample'` or `'uns:spatial:images'` to target specific dataframes for validation.\n```\n"
  },
  {
    "path": "docs/faq/reference-field.md",
    "content": "---\nexecute_via: python\n---\n\n# Where to store external links and IDs?\n\nWhen registering data in LaminDB, you might want to store a reference link or ID to indicate the source of the collection.\n\nWe have `reference` and `reference_type` fields for this purpose, they are available for {class}`~lamindb.Collection`, {class}`~lamindb.Transform`, {class}`~lamindb.Run` and {class}`~lamindb.Record`.\n\n```python\n# !pip install lamindb\n!lamin init --storage testreference\n```\n\n```python\nimport lamindb as ln\n```\n\nLet's say we have a few donor samples that came form Vendor X, in order to chase back the orders, I'd like to keep track the donor ids provided by the vendor:\n\n```python\nln.Record(\n    name=\"donor 001\", reference=\"VX984545\", reference_type=\"Donor ID from Vendor X\"\n)\n```\n\n```python\n!lamin delete --force testreference\n```\n"
  },
  {
    "path": "docs/faq/search.md",
    "content": "---\nexecute_via: python\n---\n\n# How does search work?\n\n```python\nfrom laminci.db import setup_local_test_postgres\n\npgurl = setup_local_test_postgres()\n!lamin init --name benchmark_search --db {pgurl} --modules bionty --storage ./benchmark_search\n```\n\nHere we show how to perform text search on `SQLRecord` and evaluate some search queries for the {class}`bionty.CellType` ontology.\n\n```python\nimport lamindb as ln\nimport bionty as bt\n\nSEARCH_QUERIES_EXACT = (\n    \"t cell\",\n    \"stem cell\",\n    \"b cell\",\n    \"regulatory B cell\",\n    \"Be2 cell\",\n    \"adipocyte\",\n)\nSEARCH_QUERIES_CONTAINS = (\"t cel\", \"t-cel\", \"neural\", \"kidney\", \"kidne\")\nTOP_N = 20\n\nbt.CellType.import_source()\n```\n\n```python\nln.Record(name=\"cat[*_*]\").save()\n```\n\n## Search the registry\n\n```python\nfor query in SEARCH_QUERIES_EXACT:\n    print(\"Query:\", query)\n    qs = bt.CellType.search(query)\n    display(qs.to_dataframe())\n\n    assert query.lower() == qs[0].name.lower()\n```\n\n```python\nfor query in SEARCH_QUERIES_CONTAINS:\n    print(\"Query:\", query)\n    qs = bt.CellType.search(query)\n    display(qs.to_dataframe())\n\n    top_record = qs[0]\n    query = query.lower()\n    assert query in top_record.name.lower() or query in top_record.synonyms.lower()\n```\n\nCheck escaping of special characters.\n\n```python\nassert len(ln.Record.search(\"cat[\")) == 1\n```\n\n```python\nassert len(ln.Record.search(\"*_*\")) == 1\n```\n\n## Search the public ontology\n\n```python\nct_public = bt.CellType.public()\n\ndf = ct_public.search(\"b cell\", limit=20)\nassert df.iloc[0][\"name\"] == \"B cell\"\ndf\n```\n\n```python\n!docker stop pgtest && docker rm pgtest\n!lamin delete --force benchmark_search\n```\n"
  },
  {
    "path": "docs/faq/symbol-mapping.md",
    "content": "---\nexecute_via: python\n---\n\n# Why should I not index datasets with gene symbols?\n\nGene symbols are widely used for readability, particularly for visualization. However, indexing datasets with gene symbols presents challenges:\n\n- A single gene may have multiple symbols or aliases.\n- Gene symbols change over time (e.g., _BRCA2_ was once _FACD_) without version tracking.\n- The same symbol can represent different genes across species.\n- Symbols may be misinterpreted by software (e.g., _SEPT9_ as \"September 9\" in Excel).\n- Formatting inconsistencies exist (e.g., case sensitivity, special characters).\n\nUsing unique identifiers like ENSEMBL gene IDs addresses these issues by providing:\n\n- A direct, stable mapping to genomic coordinates.\n- Consistency across databases.\n- Species-specific prefixes to prevent cross-species confusion.\n- Unique, permanent identifiers with standardized formatting.\n\nStoring ENSEMBL gene IDs alongside gene symbols offers readability for visualization while maintaining robust data integrity. 
During curation, validating against ENSEMBL gene IDs ensures accurate mapping.\n\nIf only symbols are available for a dataset, you can map them to ENSEMBL IDs using {meth}`~bionty.Gene.standardize`.\n\n```python\n# !pip install 'lamindb[bionty]'\n!lamin init --storage test-symbols --modules bionty\n```\n\n```python\nimport lamindb as ln\nimport bionty as bt\nimport numpy as np\nimport pandas as pd\nimport anndata as ad\n\n# create example AnnData object with gene symbols\nrng = np.random.default_rng(42)\nX = rng.integers(0, 100, size=(5, 10))\nvar = pd.DataFrame(\n    index=pd.Index(\n        [\n            \"BRCA1\",\n            \"TP53\",\n            \"EGFR\",\n            \"KRAS\",\n            \"PTEN\",\n            \"MYC\",\n            \"VEGFA\",\n            \"IL6\",\n            \"TNF\",\n            \"GAPDH\",\n        ],\n        name=\"symbol\",\n    )\n)\nadata = ad.AnnData(X=X, var=var)\nadata.var\n```\n\n```python\n# map Gene symbols to ENSEMBL IDs\ngene_mapper = bt.Gene.standardize(\n    adata.var.index,\n    field=bt.Gene.symbol,\n    return_field=bt.Gene.ensembl_gene_id,\n    return_mapper=True,\n    organism=\"human\",\n)\nadata.var[\"ensembl_id\"] = adata.var.index.map(\n    lambda gene_id: gene_mapper.get(gene_id, gene_id)\n)\nadata.var\n```\n\n```python\nstandardized_genes = bt.Gene.from_values(\n    [\n        \"ENSG00000141510\",\n        \"ENSG00000133703\",\n        \"ENSG00000111640\",\n        \"ENSG00000171862\",\n        \"ENSG00000204490\",\n        \"ENSG00000112715\",\n        \"ENSG00000146648\",\n        \"ENSG00000136997\",\n        \"ENSG00000012048\",\n        \"ENSG00000136244\",\n    ],\n    field=bt.Gene.ensembl_gene_id,\n    organism=\"human\",\n)\nln.save(standardized_genes)\n```\n\nThis allows for validating the `ensembl_id` against the `Gene` registry using the `bt.Gene.ensembl_gene_id` field.\n\n```python\nbt.Gene.validate(adata.var[\"ensembl_id\"], field=bt.Gene.ensembl_gene_id)\n```\n\n```{note}\nGene symbols do 
not map one-to-one with ENSEMBL IDs. A single gene symbol may correspond to multiple ENSEMBL IDs due to:\n\n1. **Gene Paralogs**: Similar symbols can be shared among paralogous genes within the same species, resulting in one symbol linking to multiple ENSEMBL IDs.\n2. **Pseudogenes**: Some symbols represent both functional genes and their non-functional pseudogenes, each with distinct ENSEMBL IDs.\n3. **Transcript Variants**: One symbol may map to multiple ENSEMBL transcript IDs, each representing different isoforms or splice variants.\n\n{meth}`~bionty.Gene.standardize` retrieves the first match in cases of multiple hits, which is generally sufficient but not perfectly accurate.\n```\n\n```python\n!lamin delete --force test-symbols\n```\n"
  },
  {
    "path": "docs/faq/test_notebooks.py",
    "content": "from pathlib import Path\n\nimport nbproject_test as test\n\nimport lamindb as ln\n\n\ndef test_notebooks():\n    nbdir = Path(__file__).parent\n    ln.setup.login(\"testuser1\")\n    ln.setup.init(storage=nbdir / \"mydata\")\n    test.execute_notebooks(nbdir, write=True)\n"
  },
  {
    "path": "docs/faq/track-run-inputs.md",
    "content": "---\nexecute_via: python\n---\n\n# Can I disable tracking run inputs?\n\nYes, if you switch {attr}`~lamindb.core.Settings.track_run_inputs` to `False`.\n\n```python\n# pip install lamindb\n!lamin init --storage test-run-inputs\n```\n\n```python\nimport lamindb as ln\n```\n\nSome test artifacts:\n\n```python\nln.track(transform=ln.Transform(key=\"Dummpy pipeline\"))\nln.Artifact(ln.examples.datasets.file_jpg_paradisi05(), description=\"My image\").save()\nln.Artifact(ln.examples.datasets.file_mini_csv(), description=\"My csv\").save()\n```\n\nCall `ln.track()`:\n\n```python\nln.track(\"Rx2s9aPTMQLY0000\")\n```\n\n## Don't track artifact as run input\n\n```python\nln.settings.track_run_inputs = False\n```\n\n```python\nartifact = ln.Artifact.get(description=\"My image\")\n```\n\n```python\nartifact.cache()\n```\n\nNo run inputs are linked to the current notebook run:\n\n```python\nln.Run.get(id=ln.context.run.id).input_artifacts.all()\n```\n\n```python\nartifact.view_lineage()\n```\n\n```python\nassert len(ln.Run.get(id=ln.context.run.id).input_artifacts.all()) == 0\n```\n\n## Manually track artifact as run input\n\nLet us manually track an artifact by passing `is_run_input` to either `.cache()`, `.load()` or `.open()`:\n\n```python\nartifact.cache(is_run_input=True)\n```\n\nYou can see the fcs artifact is now being added to the run inputs:\n\n```python\nfor input in ln.Run.get(id=ln.context.run.id).input_artifacts.all():\n    print(input)\n```\n\n```python\nartifact.view_lineage()\n```\n\n```python\nassert len(ln.Run.get(id=ln.context.run.id).input_artifacts.all()) == 1\n```\n\n## Automatically track artifacts as run input\n\nIf you switch the following setting, and call to `.load()`, `.cache()` and `.open()` will track the artifact as run input.\n\n```python\nln.settings.track_run_inputs = True\n```\n\n```python\nartifact = ln.Artifact.get(description=\"My csv\")\n```\n\n```python\nartifact.load()\n```\n\n```python\nfor input in 
ln.Run.get(id=ln.context.run.id).input_artifacts.all():\n    print(input)\n```\n\n```python\nartifact.view_lineage()\n```\n\n```python\nassert len(ln.Run.get(id=ln.context.run.id).input_artifacts.all()) == 2\n```\n\n```python\n!lamin delete --force test-run-inputs\n```\n"
  },
  {
    "path": "docs/faq/trash-archive.md",
    "content": "# How do I trash or archive objects?\n\nAny object in LaminDB has the following 3 levels of visibility through 3 default branches:\n\n- `main`: visible\n- `archive`: excluded from query & search\n- `trash`: excluded from query & search, scheduled for deletion\n\nLet's look at an example for an `Artifact` object while noting that the same applies to any other `SQLRecord`.\n\n```python\nimport lamindb as ln\nimport pandas as pd\n\ndf = pd.DataFrame({\"a\": [1, 2], \"b\": [3, 4]})\nartifact = ln.Artifact.from_dataframe(df, key=\"dataset.parquet\").save()\n```\n\nAn artifact is by default created on the `main` branch.\n\n```python\nassert artifact.branch.name == \"main\"\nln.Artifact.filter(key=\"dataset.parquet\").to_dataframe()\n# the artifact shows up\n```\n\nIf you delete an artifact, it gets moved into the `trash` branch.\n\n```python\nartifact.delete()\nassert artifact.branch.name == \"trash\"\n```\n\nArtifacts in trash won't show up in queries with default arguments:\n\n```python\nln.Artifact.filter(key=\"dataset.parquet\").to_dataframe()\n# the artifact does not show up\n```\n\nYou can query for them by adding the `trash` branch to the filter.\n\n```python\nln.Artifact.filter(key=\"dataset.parquet\", branch__name=\"trash\").to_dataframe()\n# the artifact shows up\n```\n\nYou can restore an artifact from trash:\n\n```python\nartifact.restore()\nln.Artifact.filter(key=\"dataset.parquet\").to_dataframe()\n# the artifact shows up\n```\n"
  },
  {
    "path": "docs/faq/validate-fields.md",
    "content": "---\nexecute_via: python\n---\n\n# Django field validation\n\n[Django field validation](https://docs.djangoproject.com/en/5.1/ref/validators/) are enabled for models that inherit the `ValidateFields` class.\n\n```python\n# pip install lamindb\n!lamin init --storage ./test-django-validation\n```\n\n```python\nimport lamindb as ln\nfrom lamindb.core.exceptions import FieldValidationError\n```\n\n```python\ntry:\n    ln.Reference(name=\"my ref\", doi=\"abc.ef\", url=\"myurl.com\")\nexcept FieldValidationError as e:\n    print(e)\n```\n\n```python\n!lamin delete --force test-django-validation\n```\n"
  },
  {
    "path": "docs/faq.md",
    "content": "# FAQ\n\n```{toctree}\n:maxdepth: 1\n\nfaq/pydantic-pandera\nfaq/idempotency\nfaq/acid\nfaq/track-run-inputs\nfaq/curate-any\nfaq/import-modules\nfaq/reference-field\nfaq/trash-archive\nfaq/keep-artifacts-local\nfaq/validate-fields\nfaq/symbol-mapping\nfaq/search\n```\n"
  },
  {
    "path": "docs/guide.md",
    "content": "# Guide\n\n```{toctree}\n:hidden:\n:caption: \"Overview\"\n\nREADME\n```\n\n```{toctree}\n:hidden:\n:caption: \"How to\"\n\nquery-search\ntrack\norganize\nmanage-changes\nmanage-ontologies\nsync\n```\n\n```{toctree}\n:hidden:\n:caption: Use cases\n\nlightning\n```\n\n```{toctree}\n:hidden:\n:caption: Other topics\n\nfaq\nstorage\n```\n"
  },
  {
    "path": "docs/index.md",
    "content": "```{include} ../README.md\n:start-line: 0\n:end-line: 5\n```\n\n<meta http-equiv=\"Refresh\" content=\"0; url=./guide.html\" />\n\n```{toctree}\n:maxdepth: 1\n:hidden:\n\nguide\napi\nchangelog\n```\n"
  },
  {
    "path": "docs/lightning.md",
    "content": "# Lightning\n\nThis guide offers more context on the {class}`lamindb.integrations.lightning.Checkpoint` callback. For end-to-end examples, see the following guides:\n\n- {doc}`docs:clearml`\n- {doc}`docs:wandb`\n- {doc}`docs:mlflow`\n\n## Quickstart\n\nPass `ll.Checkpoint` and a logger into `Trainer`. The logger is what gives\ncheckpoints meaningful, namespaced artifact keys — without it, keys fall back\nto a bare `checkpoints/` prefix (or just the run UID when `ln.track()` is\nactive).\n\nAny logger implementing Lightning's `Logger` interface works (`TensorBoardLogger`,\n`WandbLogger`, `MLFlowLogger`, `CSVLogger`, etc.). We use `TensorBoardLogger`\nin the examples below.\n\n```python\nimport lamindb as ln\nimport lightning.pytorch as pl\nfrom lightning.pytorch.loggers import TensorBoardLogger\nfrom lamindb.integrations import lightning as ll\n\nln.track()\n\nlogger = TensorBoardLogger(save_dir=\"logs\")\ncheckpoint = ll.Checkpoint(monitor=\"val_loss\", mode=\"min\", save_top_k=3)\n\ntrainer = pl.Trainer(\n    max_epochs=10,\n    callbacks=[checkpoint],\n    logger=logger,\n)\ntrainer.fit(model, datamodule=datamodule)\n```\n\nAfter training, each saved checkpoint file is a LaminDB artifact:\n\n```python\ncheckpoint.last_checkpoint_artifact\ncheckpoint.last_checkpoint_artifact.key\n# e.g. \"logs/lightning_logs/2r5pIRnK7z0q/checkpoints/epoch=0-step=100.ckpt\"\n\ncheckpoint.checkpoint_key_prefix\n# e.g. \"logs/lightning_logs/2r5pIRnK7z0q/checkpoints\"\n```\n\n### How is a run organized?\n\nA Lightning `Trainer` coordinates three concerns during training:\n\n1. **Logger** — writes metrics (loss curves, learning rate, etc.) to a dashboard directory. The logger determines the local directory layout: `{save_dir}/{name}/{version}/`.\n2. **ModelCheckpoint** — saves model snapshots (`.ckpt` files) into a `checkpoints/` subdirectory underneath the logger's directory.\n3. 
**SaveConfigCallback** — when using `LightningCLI`, writes the fully resolved `config.yaml` into the logger's directory so you can reproduce exactly which hyperparameters were used.\n\nAll three share the same directory tree. The logger creates it, the checkpoint callback writes into it, and the config callback stores beside it:\n\n```\nlogs/                          # logger save_dir\n  lightning_logs/              # logger name\n    version_0/                 # logger version (local filesystem)\n      events.out.tfevents.*    # ← logger output (TensorBoard)\n      config.yaml              # ← SaveConfigCallback\n      checkpoints/\n        epoch=0-step=100.ckpt  # ← ModelCheckpoint\n        epoch=1-step=200.ckpt\n        hparams.yaml           # ← auto-generated by Lightning\n```\n\nLaminDB's integration replaces `ModelCheckpoint` with `ll.Checkpoint` and\nLightning's `SaveConfigCallback` with `ll.SaveConfigCallback`. Checkpoint\nfiles, the config, and `hparams.yaml` become `lamindb.Artifact` records with\nlineage tracking and optional feature annotations.\n\nNote that artifact keys in LaminDB do **not** mirror the local directory layout\nexactly — the callback uses the LaminDB run UID instead of Lightning's\nauto-incrementing `version_N` directory by default. 
See\n[How are artifact keys derived?](#how-are-artifact-keys-derived) for details.\n\n### Which kind of artifacts?\n\n`Checkpoint` saves three kinds of artifacts:\n\n| Kind         | Example key                           | When                                     |\n| ------------ | ------------------------------------- | ---------------------------------------- |\n| `checkpoint` | `…/checkpoints/epoch=0-step=100.ckpt` | Every time Lightning writes a checkpoint |\n| `config`     | `…/config.yaml`                       | When using `ll.SaveConfigCallback`       |\n| `hparams`    | `…/checkpoints/hparams.yaml`          | When Lightning generates it              |\n\nCheckpoints and `hparams.yaml` live under the `checkpoints/` subdirectory,\nwhile the config sits directly under the base prefix.\n\nThe callback tracks the latest artifact of each kind:\n\n```python\ncheckpoint.last_checkpoint_artifact\ncheckpoint.last_config_artifact\ncheckpoint.last_hparams_artifact\ncheckpoint.last_artifact_event\n```\n\n### How is data lineage tracked?\n\nWhen a run is being tracked with `ln.track()`:\n\n- `checkpoint` artifacts are recorded as **run outputs** — they are produced by the training run.\n- `config` artifacts are recorded as **run inputs** — the resolved config is part of the run specification.\n- `hparams.yaml` is saved as an artifact but not linked as a run input.\n\n## How are artifact keys derived?\n\nLaminDB artifact keys are **not** necessarily a mirror of the local filesystem layout.\nLightning uses auto-incrementing version directories (`version_0`, `version_1`,\n…) on disk, but these are meaningless as artifact identifiers — they depend on\nwhat already exists locally and cannot reliably distinguish runs across\nmachines.\n\nInstead, when `ln.track()` is active, the callback uses the **LaminDB run UID**\nas the version segment by default (`run_uid_is_version=True`). 
This guarantees\nthat every tracked run produces unique artifact keys regardless of local state.\n\nThe base prefix is determined by priority:\n\n| Scenario                 | Base prefix                            |\n| ------------------------ | -------------------------------------- |\n| `dirpath` set (± logger) | `{dirpath}/{run_uid}`                  |\n| No `dirpath` + logger    | `{save_dir_basename}/{name}/{run_uid}` |\n| No `dirpath` + no logger | `{run_uid}`                            |\n\n`run_uid` above refers to the active LaminDB run UID (from `ln.context.run.uid`).\nWhen no run is tracked or `run_uid_is_version=False`, the callback falls back\nto the logger's own version (e.g. `version_0`) or omits the segment entirely.\n\n**Checkpoint & hparams keys:**\n\n| Scenario                      | LaminDB key pattern                                           |\n| ----------------------------- | ------------------------------------------------------------- |\n| Logger present (recommended)  | `{save_dir_basename}/{name}/{run_uid}/checkpoints/{filename}` |\n| No logger, explicit `dirpath` | `{dirpath}/{run_uid}/checkpoints/{filename}`                  |\n| No logger, no `dirpath`       | `{run_uid}/checkpoints/{filename}`                            |\n\n**Config keys:**\n\n| Scenario                      | Key pattern                                        |\n| ----------------------------- | -------------------------------------------------- |\n| Logger present                | `{save_dir_basename}/{name}/{run_uid}/config.yaml` |\n| No logger, explicit `dirpath` | `{dirpath}/{run_uid}/config.yaml`                  |\n| No logger, no `dirpath`       | `{run_uid}/config.yaml`                            |\n\nFor example, with `TensorBoardLogger(save_dir=\"logs\")` and a tracked run:\n\n```\nlogs/lightning_logs/2r5pIRnK7z0q/       # base prefix ({save_dir_basename}/{name}/{run_uid})\n  config.yaml                            # ← config artifact\n  checkpoints/\n    
epoch=0-step=100.ckpt                # ← checkpoint artifact\n    hparams.yaml                         # ← hparams artifact\n```\n\n### Opting out of run UID keys\n\nPass `run_uid_is_version=False` to fall back to the logger-managed version\ndirectory, matching Lightning's local layout more closely:\n\n```python\ncheckpoint = ll.Checkpoint(\n    monitor=\"val_loss\",\n    run_uid_is_version=False,\n)\n```\n\nWith this setting, the key uses the logger's version (`version_0`, etc.)\ninstead of the run UID. This is mainly useful when you don't call `ln.track()`\nor when you want artifact keys that exactly mirror the local directory tree.\n\n### Why run UIDs instead of `version_N`?\n\nLightning's auto-incrementing `version_N` depends on what directories already\nexist at `save_dir`. Two runs on different machines — or the same machine after\nclearing `logs/` — can both produce `version_0`. With `run_uid_is_version=True`\n(the default), each tracked run gets a unique prefix derived from the Lamin\nrun, so artifact keys never collide.\n\n## Use with the Lightning CLI\n\nThe Lightning CLI resolves a YAML config into concrete model and data module\narguments. 
To also store that resolved config as a LaminDB artifact, pass\n`ll.SaveConfigCallback` in your training script and declare the trainer,\nlogger, callbacks, model, and data in a config file.\n\n**`config.yaml`**\n\n```yaml\ntrainer:\n  max_epochs: 10\n\n  logger:\n    class_path: lightning.pytorch.loggers.TensorBoardLogger\n    init_args:\n      save_dir: logs\n\n  callbacks:\n    - class_path: lamindb.integrations.lightning.Checkpoint\n      init_args:\n        monitor: val/loss\n        mode: min\n        save_top_k: 3\n\nmodel:\n  learning_rate: 1.0e-3\n\ndata:\n  batch_size: 64\n```\n\n**`train.py`**\n\n```python\nimport lamindb as ln\nfrom lightning.pytorch.cli import LightningCLI\nfrom lamindb.integrations.lightning import SaveConfigCallback\n\nln.track()\n\ndef cli_main() -> None:\n    LightningCLI(\n        model_class=MyModel,\n        datamodule_class=MyDataModule,\n        save_config_callback=SaveConfigCallback,\n    )\n\nif __name__ == \"__main__\":\n    cli_main()\n```\n\n```bash\npython train.py fit --config config.yaml\n```\n\n`ll.SaveConfigCallback` extends Lightning's built-in version: it writes the\nlocal file as usual and then delegates to whichever\n`ArtifactPublishingModelCheckpoint` is registered on the trainer to persist the\nconfig as an artifact.\n\n## Annotating with features\n\nAttach custom run-level and artifact-level feature values through `features=`:\n\n```python\nlogger = TensorBoardLogger(save_dir=\"logs\")\ncheckpoint = ll.Checkpoint(\n    monitor=\"val_loss\",\n    features={\n        \"run\": {\"training_framework\": \"lightning\"},\n        \"artifact\": {\"dataset_version\": \"2026-03\"},\n    },\n)\n\ntrainer = pl.Trainer(callbacks=[checkpoint], logger=logger)\n```\n\nFeature names must already exist in Lamin.\n\nThe callback can also auto-track standard Lightning fields. 
Create the\ncorresponding LaminDB features once:\n\n```python\nll.save_lightning_features()\n```\n\nThis enables auto-features:\n\n- Artifact-level: `is_best_model`, `is_last_model`, `score`, `model_rank`, `save_weights_only`, `monitor`, `mode`\n- Run-level: `logger_name`, `logger_version`, `max_epochs`, `max_steps`,\n  `precision`, `accumulate_grad_batches`, `gradient_clip_val`, `monitor`, `mode`\n\n## Extending the callback\n\n### Subclass `Checkpoint`\n\nSubclass when you want to keep LaminDB persistence and additionally notify an\nexternal system after each artifact is saved:\n\n```python\nfrom lamindb.integrations import lightning as ll\nfrom my_model_registry import ModelRegistry\n\n\nclass ModelRegistryCheckpoint(ll.Checkpoint):\n    \"\"\"Register each checkpoint in an external model registry.\"\"\"\n\n    def __init__(self, *args, registry_project: str, **kwargs):\n        super().__init__(*args, **kwargs)\n        self.registry_project = registry_project\n        self.model_registry = ModelRegistry()\n\n    def on_artifact_saved(self, event: ll.ArtifactSavedEvent) -> None:\n        if event.kind == \"checkpoint\":\n            # register the model in your external system\n            self.model_registry.register(\n                project=self.registry_project,\n                model_uri=event.storage_uri,\n                metadata={\"lamin_key\": event.key},\n            )\n\n\nlogger = TensorBoardLogger(save_dir=\"logs\")\ncheckpoint = ModelRegistryCheckpoint(\n    registry_project=\"my-project\",\n    monitor=\"val_loss\",\n    save_top_k=3,\n)\ntrainer = pl.Trainer(callbacks=[checkpoint], logger=logger)\ntrainer.fit(model, datamodule=datamodule)\n```\n\nEach event gives you:\n\n- `event.kind`: `\"checkpoint\"`, `\"config\"`, or `\"hparams\"`\n- `event.artifact`: the persisted LaminDB artifact\n- `event.key`: the LaminDB artifact key\n- `event.local_path`: the local file path Lightning wrote\n- `event.storage_uri`: the stable storage URI for downstream 
systems\n\n### Attach an observer\n\nObservers are useful when you want composition instead of inheritance:\n\n```python\nfrom lamindb.integrations import lightning as ll\n\n\nclass ArtifactLogger:\n    def on_artifact_saved(self, event: ll.ArtifactSavedEvent) -> None:\n        print(event.kind, event.storage_uri)\n\n    def on_artifact_removed(self, event: ll.ArtifactRemovedEvent) -> None:\n        print(\"removed\", event.key)\n\n\nlogger = TensorBoardLogger(save_dir=\"logs\")\ncheckpoint = ll.Checkpoint(\n    monitor=\"val_loss\",\n    artifact_observers=[ArtifactLogger()],\n)\n\ntrainer = pl.Trainer(callbacks=[checkpoint], logger=logger)\ntrainer.fit(model, datamodule=datamodule)\n```\n\nObservers receive the same events that subclasses see.\n\n## Integrating other systems\n\nTo register checkpoints in another system (e.g. ClearML, Weights & Biases,\nMLflow, Neptune, or Comet), use the artifact lifecycle events rather than\nre-deriving paths from Lightning internals.\n\nThe key hand-off value is `event.storage_uri`, which resolves to the persisted\nartifact location. `event.artifact` gives you the full LaminDB record when you\nneed metadata beyond the URI.\n"
  },
  {
    "path": "docs/manage-changes.md",
    "content": "# Manage changes\n\nManaging changes in LaminDB is largely analogous to managing code changes via branching in git and Pull Requests in GitHub.\n\nFor usage examples, read the `Examples` section of the {class}`~lamindb.Branch` class.\n"
  },
  {
    "path": "docs/manage-ontologies.md",
    "content": "---\nexecute_via: python\n---\n\n# Manage biological ontologies\n\nThis guide shows how to manage ontologies for basic biological entities.\n\n```{raw} html\n<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/3vpWjHj3Kw8?si=D0jxqL2zB4idh2QA\" title=\"YouTube video player\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen></iframe>\n```\n\nIf instead you're interested in\n\n- accessing public ontologies, see {doc}`docs:public-ontologies`\n- flexible bio registries for the wetlab (a LIMS), see {class}`~lamindb.Record` and {doc}`docs:records`\n\n```python\n# pip install lamindb\n!lamin init --storage ./test-ontologies --modules bionty\n```\n\n## Import records from public ontologies\n\nLet's first populate our {class}`~bionty.CellType` registry with the default public ontology (Cell Ontology).\n\n```python\nimport lamindb as ln\nimport bionty as bt\n\n# inspect the available public ontology versions\nbt.Source.to_dataframe()\n```\n\n```python\n# inspect which ontology version we're about to import\nbt.Source.get(entity=\"bionty.CellType\", currently_used=True)\n```\n\n```python\n# populate the database with a public ontology\nbt.CellType.import_source()\n```\n\nThis is now your in-house cell type ontology in which you can add & modify records as you like. 
It's a registry just like `Artifact` or `Record`.\n\n```python\n# all public cell types are now available in LaminDB\nbt.CellType.to_dataframe()\n```\n\n```python\n# let's also populate the Gene registry with human and mouse genes\nbt.Gene.import_source(organism=\"human\")\nbt.Gene.import_source(organism=\"mouse\")\n```\n\n## Access records in in-house registries\n\nSearch key words:\n\n```python\nbt.CellType.search(\"gamma-delta T\").to_dataframe().head(2)\n```\n\nOr look up with auto-complete:\n\n```python\ncell_types = bt.CellType.lookup()\nhsc_record = cell_types.hematopoietic_stem_cell\nhsc_record\n```\n\nFilter by fields and relationships:\n\n```python\ngdt_cell = bt.CellType.get(ontology_id=\"CL:0000798\", created_by__handle=\"testuser1\")\ngdt_cell\n```\n\nView the ontological hierarchy:\n\n```python\ngdt_cell.view_parents()  # pass with_children=True to also view children\n```\n\nOr access the parents and children directly:\n\n```python\ngdt_cell.parents.to_dataframe()\n```\n\n```python\ngdt_cell.children.to_dataframe()\n```\n\nIt is also possible to recursively query parents or children, getting direct parents (children), their parents, and so forth.\n\n```python\ngdt_cell.query_parents().to_dataframe()\n```\n\n```python\ngdt_cell.query_children().to_dataframe()\n```\n\n## Construct custom hierarchies of records\n\nYou can add a child of a parent record:\n\n```python\n# register a new cell type\nmy_celltype = bt.CellType(name=\"my new T-cell subtype\").save()\n# specify \"gamma-delta T cell\" as a parent\nmy_celltype.parents.add(gdt_cell)\n\n# visualize hierarchy\nmy_celltype.view_parents(distance=3)\n```\n\n## Create new records\n\nWhen accessing datasets, one often encounters bulk references to entities that might be corrupted or standardized using different standardization schemes.\n\nLet's consider an example based on an `AnnData` object, in the `cell_type` annotations of this `AnnData` object, we find 4 references to cell types:\n\n```python\nadata = 
ln.examples.datasets.anndata_with_obs()\nadata.obs.cell_type.value_counts()\n```\n\nWe'd like to load the corresponding records in our in-house registry to annotate a dataset.\n\nTo this end, you'll typically use {class}`~lamindb.models.CanCurate.from_values`, which will both validate & retrieve records that match the values.\n\n```python\ncell_types = bt.CellType.from_values(adata.obs.cell_type)\ncell_types\n```\n\nLogging informed us that 3 cell types were validated. Since we loaded these records at the same time, we could readily use them to annotate a dataset.\n\n:::{dropdown} What happened under-the-hood?\n\n`.from_values()` performs the following look ups:\n\n1. If registry records match the values, load these records\n2. If values match synonyms of registry records, load these records\n3. If no record in the registry matches, attempt to load records from a public ontology\n4. Same as 3. but based on synonyms\n\nNo records will be returned if all 4 look ups are unsuccessful.\n\nSometimes, it's useful to treat validated records differently from non-validated records. 
Here is a way:\n\n```\noriginal_values = [\"gut\", \"gut2\"]\ninspector = bt.Tissue.inspect(original_values)\nrecords_from_validated_values = bt.Tissue.from_values(inspector.validated)\n```\n\n:::\n\nAlternatively, we can retrieve records based on ontology ids:\n\n```python\nadata.obs.cell_type_id.unique().tolist()\n```\n\n```python\nbt.CellType.from_values(adata.obs.cell_type_id, field=bt.CellType.ontology_id)\n```\n\n## Validate & standardize\n\nSimple validation of an iterable of values works like so:\n\n```python\nbt.CellType.validate([\"fat cell\", \"blood forming stem cell\"])\n```\n\nBecause these values don't comply with the registry, they're not validated!\n\nYou can easily convert these values to validated standardized names based on synonyms like so:\n\n```python\nbt.CellType.standardize([\"fat cell\", \"blood forming stem cell\"])\n```\n\nAlternatively, you can use `.from_values()`, which will only ever return validated records and automatically standardize under-the-hood:\n\n```python\nbt.CellType.from_values([\"fat cell\", \"blood forming stem cell\"])\n```\n\nIf you are not sure what to do, use `.inspect()` to get instructions:\n\n```python\nbt.CellType.inspect([\"fat cell\", \"blood forming stem cell\"]);\n```\n\nWe can also add new synonyms to a record:\n\n```python\nhsc_record.add_synonym(\"HSC\")\n```\n\nAnd when we encounter this synonym as a value, it will now be standardized using synonyms-lookup, and mapped on the correct registry record:\n\n```python\nbt.CellType.standardize([\"HSC\"])\n```\n\nA special synonym is `.abbr` (short for abbreviation), which has its own field and can be assigned via:\n\n```python\nhsc_record.set_abbr(\"HSC\")\n```\n\nYou can create a lookup object from the `.abbr` field:\n\n```python\ncell_types = bt.CellType.lookup(\"abbr\")\ncell_types.hsc\n```\n\nThe same workflow works for all of `bionty`'s registries.\n\n## Manage ontologies across organisms\n\nSeveral registries are organism-aware (they have a `.organism` field), 
for instance, {class}`~bionty.Gene`.\n\nIn this case, API calls that interact with multi-organism registries require an `organism` argument when there's ambiguity.\n\nFor instance, when validating gene symbols:\n\n```python\nbt.Gene.validate([\"TCF7\", \"ABC1\"], organism=\"human\")\n```\n\nIn contrast, working with Ensembl Gene IDs doesn't require passing `organism`, as there's no ambiguity:\n\n```python\nbt.Gene.validate(\n    [\"ENSG00000000419\", \"ENSMUSG00002076988\"], field=bt.Gene.ensembl_gene_id\n)\n```\n\nWhen working with the same organism throughout your analysis/workflow, you can omit the `organism` argument by configuring it globally:\n\n```python\nbt.settings.organism = \"mouse\"\nbt.Gene.from_source(symbol=\"Ap5b1\")\n```\n\n## Track ontology versions\n\nUnder-the-hood, source ontology versions are automatically tracked for each registry:\n\n```python\nbt.Source.filter(currently_used=True).to_dataframe()\n```\n\nEach record is linked to a versioned public source (if it was created from public):\n\n```python\nhepatocyte = bt.CellType.get(name=\"hepatocyte\")\nhepatocyte.source\n```\n\n## Create records from a specific ontology version\n\nBy default, new records are imported or created from the `\"currently_used\"` public sources which are configured during the instance initialization, e.g.:\n\n```python\nbt.Source.filter(entity=\"bionty.Phenotype\", currently_used=True).to_dataframe()\n```\n\nSometimes, the default source doesn't contain the ontology term you are looking for.\n\nYou can then specify to create a record from a non-default source. 
For instance, we can use the `ncbitaxon` ontology:\n\n```python\nsource = bt.Source.get(entity=\"bionty.Organism\", name=\"ncbitaxon\")\nsource\n```\n\n```python\n# validate against the NCBI Taxonomy\nbt.Organism.validate(\n    [\"iris setosa\", \"iris versicolor\", \"iris virginica\"], source=source\n)\n```\n\n```python\n# since we didn't seed the Organism registry with the NCBITaxon public ontology\n# we need to save the records to the database\nrecords = bt.Organism.from_values(\n    [\"iris setosa\", \"iris versicolor\", \"iris virginica\"], source=source\n).save()\n\n# now we can query an iris organism and view its parents and children\nbt.Organism.get(name=\"iris\").view_parents(with_children=True)\n```\n\n<!-- #region -->\n\n## Access any Ensembl genes\n\nGenes from all Ensembl versions and organisms can be accessed, even though they are not yet present in the `bt.Source` registry.\n\nFor instance, if you want to use `rabbit` genes from Ensembl version `release-103`:\n\n```python\n\n# pip install pymysql\nimport bionty as bt\n\n# automatically download genes for a new organism\ngene_ontology = bt.base.Gene(source=\"ensembl\", organism=\"rabbit\", version='release-103')\n\n# register the new source in lamindb\ngene_ontology.register_source_in_lamindb()\n\n# now you can start using this source\n\n# import all genes from this source to your Gene registry\nsource = bt.Source.get(entity=\"bionty.Gene\", name=\"ensembl\", organism=\"rabbit\", version=\"release-103\")\nbt.Gene.import_source(source=source)\n```\n\n<!-- #endregion -->\n"
  },
  {
    "path": "docs/organize.md",
    "content": "# Organize datasets\n\n```{toctree}\n:maxdepth: 1\n:hidden:\n\ncurate\n```\n\nThis guide walks through organizing datasets using files & folders, database relationships, and versioned collections.\n\n## Via files & folders\n\nYou can use LaminDB like a file system. Similar to AWS S3, you organize artifacts into virtual folders using `/`-separated keys. To ingest a single file into a `project1/` folder, you'd call:\n\n```python\nartifact1 = ln.Artifact(\"./dataset.csv\", key=\"project1/dataset1.csv\").save()\n```\n\nFor convenience, if you want to create an artifact for every file in a directory, use {meth}`~lamindb.Artifact.from_dir`:\n\n```python\nartifacts = ln.Artifact.from_dir(\"./project1/\").save()\n```\n\nYou can then query for all artifacts in the `\"./project1/\"` folder via:\n\n```python\nartifacts = ln.Artifact.filter(key__startswith=\"project1/\")\n```\n\nUnlike a regular file system, every artifact is versioned and comes with rich metadata.\n\n:::{dropdown} What if I do not care about the metadata and version of every file in a folder?\n\nIn some cases a folder _is_ the dataset and you don't need fine-grained information for every file.\nIn this scenario, save the entire directory as a single artifact:\n\n```python\nln.Artifact(\"./folder_abc\", key=\"folder_abc\").save()\n```\n\n:::\n\n## Via relationships in the database\n\n### Annotating with projects\n\nWhat if an artifact is relevant to multiple projects?\nA dataset that's in the `project1/` folder cannot **also** reside in a `project2/` folder.\nYou can solve this problem with the `artifact.projects` relationship that links the {class}`~lamindb.Project` to {class}`~lamindb.Artifact`:\n\n<img width=\"400\" alt=\"image\" src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/uVm5ptyqukPEKCix0000.png\"/>\n\nHere is how to annotate one artifact with two projects:\n\n```python\nproject1 = ln.Project(name=\"Project 1\").save()  # create project 1\nproject2 = ln.Project(name=\"Project 
2\").save()  # create project 2\nartifact1.projects.add(project1, project2)      # annotate artifact1\n```\n\nThis allows you to retrieve `artifact1` by querying any project it belongs to:\n\n```python\nartifacts_in_project1 = ln.Artifact.filter(projects=project1)\nartifacts_in_project2 = ln.Artifact.filter(projects=project2)\n```\n\nHere, `artifact1` is part of both query results.\n\n:::{dropdown} Three additional advantages to using related registries rather than folder structures.\n\n1. Projects can be richly annotated (e.g., with start/end dates, parent projects, or member roles).\n2. You no longer need to rely on fragile file paths. If a folder is renamed, path-based retrieval breaks, but a project query by `uid` will always work.[^protectproject]\n3. You can run a constrained query or search against all projects in your database rather than trying to narrow a search to folder names.\n\n:::\n\n### Annotating with labels\n\nYou can annotate with other entity types, not just projects. 
LaminDB offers two main classes for this: {class}`~lamindb.Record` for metadata records and {class}`~lamindb.ULabel` for simple labels, which both link to artifacts:\n\n<img width=\"400\" alt=\"image\" src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/qvhxt6UuoUO2Bd820000.png\"/>\n\nHere is how to annotate with a ulabel and with a sample record:\n\n```python\nulabel1 = ln.ULabel(name=\"raw_data\").save()  # create a ulabel\nartifact1.ulabels.add(ulabel1)               # annotate artifact1\n\nsample_type = ln.Record(                     # create a record type \"Samples\"\n    name=\"Samples\",\n    is_type=True\n).save()\nrecord1 = ln.Record(                         # create a sample record\n    name=\"My sample\",\n    features={\"gc_content\": 0.5}\n).save()\nartifact1.records.add(record1)               # annotate artifact1\n```\n\nYou can use records and ulabels alongside entity types in modules such as {mod}`bionty`:\n\n```python\nimport bionty as bt\n\ncell_type1 = bt.CellType.from_source(\n    name=\"T cell\"                            # create a cell type from a public ontology\n).save()\nartifact1.cell_types.add(cell_type1)         # annotate artifact1\n```\n\n### Annotating with features\n\nTo annotate with non-categorical data types or to disambiguate categorical annotations, use {class}`~lamindb.Feature` objects.\n\n<img width=\"400\" alt=\"image\" src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/eT6SEny5HpQQNgFl0000.png\"/>\n\nHere is how to define features and annotate an artifact with feature values:\n\n```python\nexp_type = ln.Record.get(name=\"Experiments\")          # query the entity type `Experiments`\nln.Feature(name=\"gc_content\", dtype=float).save()     # define a feature with dtype float\nln.Feature(name=\"experiment\", dtype=exp_type).save()  # define a feature with dtype `Experiments`\nartifact.features.set_values({\n    \"gc_content\": 0.55,                               # validated to be a float\n    \"experiment\": 
\"Experiment 1\",                     # validated to exist under the `Experiments` record type\n})\n```\n\nWhen you work with structured data formats like `DataFrame` or `AnnData`, it often makes sense to validate the content of their features. After validation, the parsed feature values are automatically used for annotation. The easiest way to use validation and auto-annotation is the built-in schema `\"valid_features\"`:\n\n```python\n# validate columns in the dataframe and map them on features\n# auto-annotate with parsed metadata\nln.Artifact.from_dataframe(df, schema=\"valid_features\").save()\n```\n\nBelow is an example from the {doc}`docs:tutorial` illustrating how you get e.g. cell type, treatment, and assay annotations based on a dataframe's content. You can read more on this in {doc}`/curate`.\n\n<img width=\"600px\" src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/6sofuDVvTANB0f480003.png\">\n\n### Annotating with data-lineage\n\nWhen you call {func}`~lamindb.track` or decorate a function with {func}`~lamindb.flow`, you automatically annotate artifacts with {class}`~lamindb.Run` and {class}`~lamindb.Transform` objects.\n\n<img width=\"400\" alt=\"image\" src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/Z1iliqp5mInQQ2iY0000.png\"/>\n\nHere is how:\n\n```{eval-rst}\n.. literalinclude:: scripts/run_track_and_finish.py\n   :language: python\n```\n\nNote that you can pass `project` to {func}`~lamindb.track` to auto-annotate all objects that are created in a run with a project label. Read more in {doc}`/track`.\n\n### Overview of auto-generated annotations\n\nThe {class}`~lamindb.Artifact` registry has simple fields (such as `description`, `created_at`, `size`) and related fields (such as `projects`, `created_by`, `storage`). 
Many of these fields are automatically populated and you can use them to retrieve sets of artifacts.\n\n<img width=\"800px\" src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/HMfWLa1rFkxcxQEN0000.svg\">\n\nAll other registries link to {class}`~lamindb.Artifact` to provide context for finding, querying, validating, and managing artifacts.[^starsnowflake]\n\n:::{dropdown} Can you give me some example queries?\n\nHere are examples leveraging auto-populated fields.\n\n```python\nartifacts = ln.Artifact.filter(\n    created_at__gt=\"2023-06-24\",    # created after June 24th, 2023\n    size__lt=1e9,                   # smaller than 1GB\n    suffix=\".parquet\",              # with a .parquet suffix\n    n_observations__gt=1000,        # with more than 1000 observations\n    n_files__gt=1000,               # folder-like artifacts with more than 1000 files\n    otype=\"DataFrame\",              # that are DataFrames\n    created_on__name=\"my-branch\",   # created on a specific branch or environment\n    created_by__handle=\"falexwolf\", # created by user with handle falexwolf\n    run=run,                        # created by a specific run\n    transform__name=\"my-script.py\", # created by a specific script/notebook\n)\n```\n\n:::\n\n## Versioned collections of artifacts\n\nIf you want to group artifacts by metadata and version the entire set, use {class}`~lamindb.Collection`.\n\n<img width=\"160\" alt=\"image\" src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/QR0KuktVEnVL08K90000.png\"/>\n\nUnlike during annotation, you have to pass an entire group of artifacts to a `Collection` constructor:\n\n```python\ncollection = ln.Collection([artifact1, artifact2], key=\"my_data_release\").save()\n```\n\nAnd unlike the folder-based or annotation-based sets of artifacts — which can change as artifacts are added or removed — a collection guarantees an exact, immutable set of artifacts.\n\nArtifacts are versioned based on the hash of their content. 
Collections are versioned based on the top-level hash of their artifact hashes. If you use the {meth}`~lamindb.Collection.append` method, a new version of the collection is created, and the old version is left unchanged:\n\n```python\ncollection_v2 = collection.append(artifact3)\n```\n\nWhile collections are indirectly annotated through the annotations of the artifacts they contain, you can also add collection-level annotations. Like artifacts, collections link to projects, runs, ulabels, records, and most other registries.\n\n[^starsnowflake]: You can consider the SQL table underlying {class}`~lamindb.Artifact` your _fact table_ and all other tables for other entities your _dimension tables_ in a star or Snowflake schema ([see Wikipedia](https://en.wikipedia.org/wiki/Fact_table)).\n\n[^protectproject]: The project annotation of the artifact is protected against the deletion of the project. If a user with necessary rights attempts to delete the project, they will get an error.\n"
  },
  {
    "path": "docs/pertdb.md",
    "content": "# `pertdb`\n\n```{eval-rst}\n.. automodule:: pertdb\n```\n"
  },
  {
    "path": "docs/query-search.md",
    "content": "# Query, search & stream\n\n```{toctree}\n:maxdepth: 1\n\nregistries\narrays\n```\n"
  },
  {
    "path": "docs/registries.md",
    "content": "---\nexecute_via: python\n---\n\n# Query & search registries\n\nThis guide walks through different ways of querying & searching LaminDB registries.\nTo understand the underlying cross-linking of objects in the SQL database, read {doc}`organize`.\n\nIf you already have a set of artifacts, e.g. in the form of parquet files, and you'd like to now query/stream their (validated) content, read {doc}`arrays`.\n\n```python\n# initialize a test database to run examples\n!lamin init --storage ./test-registries --modules bionty\n```\n\nLet's start by creating a few exemplary datasets:\n\n```python\nimport lamindb as ln\n\nln.Artifact(ln.examples.datasets.file_fastq(), key=\"raw/my_fastq.fastq.gz\").save()\nln.Artifact(ln.examples.datasets.file_jpg_paradisi05(), key=\"my_image.jpg\").save()\nln.Artifact.from_dataframe(ln.examples.datasets.df_iris(), key=\"iris.parquet\").save()\nln.examples.datasets.mini_immuno.save_mini_immuno_datasets()\n```\n\n## Get an overview\n\nThe easiest way to get an overview over all artifacts is by typing {meth}`~lamindb.Artifact.to_dataframe`, which returns the 100 latest artifacts in the {class}`~lamindb.Artifact` registry.\n\n```python\nln.Artifact.to_dataframe()\n```\n\nYou can include features.\n\n```python\nln.Artifact.to_dataframe(include=\"features\")\n```\n\nYou can include fields from other registries.\n\n```python\nln.Artifact.to_dataframe(\n    include=[\n        \"created_by__name\",\n        \"records__name\",\n        \"cell_types__name\",\n        \"schemas__itype\",\n    ]\n)\n```\n\nYou can also get an overview of the entire database.\n\n```python\nln.view()\n```\n\n## Auto-complete objects\n\nFor registries with less than 100k objects, auto-completing a `Lookup` object is the most convenient way of finding a record.\n\n```python\nrecords = ln.Record.lookup()\n```\n\nWith auto-complete, we find a record:\n\n```python\nexperiment_1 = records.experiment_1\nexperiment_1\n```\n\nThis works for any 
{class}`~lamindb.models.BaseSQLRecord` class, e.g., also for plugin `bionty`.\n\n```python\nimport bionty as bt\n\ncell_types = bt.CellType.lookup()\n```\n\n## Get one object\n\n{meth}`~lamindb.models.BaseSQLRecord.get` errors if none or more than one matching objects are found.\n\n```python\nln.Record.get(experiment_1.uid)  # by uid\nln.Record.get(name=\"Experiment 1\")  # by field\n```\n\n## Query objects by fields\n\nUse {meth}`~lamindb.models.BaseSQLRecord.filter` to query all artifacts by the `suffix` field:\n\n```python\nqs = ln.Artifact.filter(suffix=\".h5ad\")\nqs\n```\n\nThis returns a {class}`~lamindb.models.QuerySet`, which lazily references the set of {class}`~lamindb.models.BaseSQLRecord` objects that matches the filter statement. You can iteratively filter a queryset:\n\n```python\nqs = qs.filter(records__name=\"Experiment 1\")\n```\n\nTo access the results encoded in a queryset, call:\n\n- {meth}`~lamindb.models.BasicQuerySet.to_dataframe`: A pandas `DataFrame` with each record in a row.\n- {meth}`~lamindb.models.BasicQuerySet.one`: Exactly one record. Will raise an error if there is none. 
Is equivalent to the `.get()` method shown above.\n- {meth}`~lamindb.models.BasicQuerySet.one_or_none`: Either one record or `None` if there is no query result.\n\nAlternatively,\n\n- use the `QuerySet` as an iterator\n- get individual objects via `qs[0]`, `qs[1]`\n\nFor example:\n\n```python\nqs.to_dataframe()\n```\n\nNote that the `SQLRecord` classes in LaminDB are Django Models and any [Django query](https://docs.djangoproject.com/en/stable/topics/db/queries/) works.\n\n## Query objects by features\n\nThe `Artifact`, `Record`, and `Run` registries can be queried by features.\n\n```python\nln.Artifact.filter(perturbation=\"DMSO\").to_dataframe(include=\"features\")\n```\n\nYou can also query by passing a `Feature` object, which is useful to disambiguate feature names.\n\n```python\nperturbation = ln.Feature.get(name=\"perturbation\")  # can optionally pass a feature type to disambiguate\nln.Artifact.filter(perturbation == \"DMSO\")  # note this is now an expression using the == syntax\n```\n\nJust like for fields holding dictionary values, you can query for dictionary keys in features whose `dtype` is `dict`:\n\n```python\nln.Artifact.filter(study_metadata__detail1=\"123\").to_dataframe(include=\"features\")\n```\n\n```python\nln.Artifact.filter(study_metadata__detail2=2).to_dataframe(include=\"features\")\n```\n\nYou can query for whether a dataset is annotated or not annotated by a feature.\n\n```python\nln.Artifact.filter(perturbation__isnull=True).to_dataframe(include=\"features\")\n```\n\n```python\nln.Artifact.filter(perturbation__isnull=False).to_dataframe(include=\"features\")\n```\n\n## Query runs by parameters\n\nHere is an example for querying by parameters: {ref}`track-run-parameters`.\n\n## Search for objects\n\nYou can search every registry via {meth}`~lamindb.models.SQLRecord.search`. 
For example, the `Artifact` registry.\n\n```python\nln.Artifact.search(\"iris\").to_dataframe()\n```\n\nHere is more background on search and examples for searching the entire cell type ontology: {doc}`/faq/search`\n\n## Query related registries\n\nDjango has a double-under-score syntax to filter based on related tables.\n\nThis syntax enables you to traverse several layers of relations and leverage different comparators.\n\n```python\nln.Artifact.filter(created_by__handle__startswith=\"testuse\").to_dataframe()\n```\n\nThe filter selects all artifacts based on the users who ran the generating notebook. Under the hood, in the SQL database, it's joining the artifact table with the user table.\n\nAnother typical example is querying all datasets that measure a particular feature. For instance, which datasets measure `\"CD8A\"`. Here is how to do it:\n\n```python\ncd8a = bt.Gene.get(symbol=\"CD8A\")\n# query for all feature sets that contain CD8A\nschemas_with_cd8a = ln.Schema.filter(genes=cd8a)\n# get all artifacts\nln.Artifact.filter(schemas__in=schemas_with_cd8a).to_dataframe()\n```\n\nInstead of splitting this across three queries, the double-underscore syntax allows you to define a path for one query.\n\n```python\nln.Artifact.filter(schemas__genes__symbol=\"CD8A\").to_dataframe()\n```\n\n## Filter operators\n\nYou can qualify the type of comparison in a query by using a comparator.\n\nBelow follows a list of the most important ones, but Django supports about [two dozen field comparators](https://docs.djangoproject.com/en/stable/ref/models/querysets/#field-lookups) of the form `field__comparator=value`.\n\n### and\n\n```python\nln.Artifact.filter(suffix=\".h5ad\", records=experiment_1).to_dataframe()\n```\n\n### less than/ greater than\n\nOr subset to artifacts greater than 10kB. 
Here, we append the `__gt` comparator to the field name in the keyword argument.\n\n```python\nln.Artifact.filter(records=experiment_1, size__gt=1e4).to_dataframe()\n```\n\n### in\n\n```python\nln.Artifact.filter(suffix__in=[\".jpg\", \".fastq.gz\"]).to_dataframe()\n```\n\n### order by\n\n```python\nln.Artifact.filter().order_by(\"created_at\").to_dataframe()\n```\n\n```python\n# reverse ordering\nln.Artifact.filter().order_by(\"-created_at\").to_dataframe()\n```\n\n```python\nln.Artifact.filter().order_by(\"key\").to_dataframe()\n```\n\n```python\n# reverse ordering\nln.Artifact.filter().order_by(\"-key\").to_dataframe()\n```\n\n### contains\n\n```python\nln.Transform.filter(description__contains=\"search\").to_dataframe().head(5)\n```\n\nAnd case-insensitive:\n\n```python\nln.Transform.filter(description__icontains=\"Search\").to_dataframe().head(5)\n```\n\n### startswith\n\n```python\nln.Transform.filter(description__startswith=\"Query\").to_dataframe()\n```\n\n### or\n\n```python\nln.Artifact.filter(ln.Q(suffix=\".jpg\") | ln.Q(suffix=\".fastq.gz\")).to_dataframe()\n```\n\n### negate/ unequal\n\n```python\nln.Artifact.filter(~ln.Q(suffix=\".jpg\")).to_dataframe()\n```\n"
  },
  {
    "path": "docs/scripts/curate_anndata_flexible.py",
    "content": "import lamindb as ln\n\nln.examples.datasets.mini_immuno.define_features_labels()\nadata = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"AnnData\")\nartifact = ln.Artifact.from_anndata(\n    adata,\n    key=\"examples/mini_immuno.h5ad\",\n    schema=\"ensembl_gene_ids_and_valid_features_in_obs\",\n).save()\nartifact.describe()\n"
  },
  {
    "path": "docs/scripts/curate_anndata_uns.py",
    "content": "import lamindb as ln\n\nln.examples.datasets.mini_immuno.define_features_labels()\nadata = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"AnnData\")\nschema = ln.Schema.get(name=\"Study metadata schema\")\nartifact = ln.Artifact.from_anndata(\n    adata, schema=schema, key=\"examples/mini_immuno_uns.h5ad\"\n)\nartifact.describe()\n"
  },
  {
    "path": "docs/scripts/curate_dataframe_attrs.py",
    "content": "import lamindb as ln\n\nfrom .define_schema_df_metadata import study_metadata_schema\n\ndf = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"DataFrame\")\nschema = ln.Schema(\n    features=[ln.Feature(name=\"perturbation\", dtype=\"str\").save()],\n    slots={\"attrs\": study_metadata_schema},\n    otype=\"DataFrame\",\n).save()\ncurator = ln.curators.DataFrameCurator(df, schema=schema)\ncurator.validate()\nartifact = curator.save_artifact(key=\"examples/df_with_attrs.parquet\")\nartifact.describe()\n"
  },
  {
    "path": "docs/scripts/curate_dataframe_external_features.py",
    "content": "import lamindb as ln\nfrom datetime import date\n\ndf = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"DataFrame\")\n\ntemperature = ln.Feature(name=\"temperature\", dtype=float).save()\ndate_of_study = ln.Feature(name=\"date_of_study\", dtype=date).save()\nexternal_schema = ln.Schema(features=[temperature, date_of_study]).save()\n\nconcentration = ln.Feature(name=\"concentration\", dtype=str).save()\ndonor = ln.Feature(name=\"donor\", dtype=str, nullable=True).save()\nschema = ln.Schema(\n    features=[concentration, donor],\n    slots={\"__external__\": external_schema},\n    otype=\"DataFrame\",\n).save()\n\nartifact = ln.Artifact.from_dataframe(\n    df,\n    key=\"examples/dataset1.parquet\",\n    features={\"temperature\": 21.6, \"date_of_study\": date(2024, 10, 1)},\n    schema=schema,\n).save()\nartifact.describe()\n"
  },
  {
    "path": "docs/scripts/curate_dataframe_flexible.py",
    "content": "import lamindb as ln\n\nln.examples.datasets.mini_immuno.define_features_labels()\ndf = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"DataFrame\")\nartifact = ln.Artifact.from_dataframe(\n    df, key=\"examples/dataset1.parquet\", schema=\"valid_features\"\n).save()\nartifact.describe()\n"
  },
  {
    "path": "docs/scripts/curate_dataframe_minimal_errors.py",
    "content": "import lamindb as ln\n\nschema = ln.examples.datasets.mini_immuno.define_mini_immuno_schema_flexible()\ndf = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"DataFrame\")\ndf.pop(\"donor\")  # remove donor column to trigger validation error\ntry:\n    artifact = ln.Artifact.from_dataframe(\n        df, key=\"examples/dataset1.parquet\", schema=schema\n    ).save()\nexcept ln.errors.ValidationError as error:\n    print(error)\n"
  },
  {
    "path": "docs/scripts/curate_dataframe_union_features.py",
    "content": "import lamindb as ln\nimport pandas as pd\n\nunion_feature = ln.Feature(\n    name=\"mixed_feature\",\n    dtype=\"cat[bionty.Tissue.ontology_id|bionty.CellType.ontology_id]\",\n).save()\n\ndf_mixed = pd.DataFrame({\"mixed_feature\": [\"UBERON:0000178\", \"CL:0000540\"]})\n\nschema = ln.Schema(features=[union_feature], coerce=True).save()\n\ncurator = ln.curators.DataFrameCurator(df_mixed, schema)\ncurator.validate()\n"
  },
  {
    "path": "docs/scripts/curate_mudata.py",
    "content": "import lamindb as ln\nimport bionty as bt\n\nfrom docs.scripts.define_schema_df_metadata import study_metadata_schema\n\n# define labels\nperturbation = ln.Record(name=\"Perturbation\", is_type=True).save()\nln.Record(name=\"Perturbed\", type=perturbation).save()\nln.Record(name=\"NT\", type=perturbation).save()\n\nreplicate = ln.Record(name=\"Replicate\", is_type=True).save()\nln.Record(name=\"rep1\", type=replicate).save()\nln.Record(name=\"rep2\", type=replicate).save()\nln.Record(name=\"rep3\", type=replicate).save()\n\n# define the global obs schema\nobs_schema = ln.Schema(\n    name=\"mudata_papalexi21_subset_obs_schema\",\n    features=[\n        ln.Feature(name=\"perturbation\", dtype=\"cat[Record[Perturbation]]\").save(),\n        ln.Feature(name=\"replicate\", dtype=\"cat[Record[Replicate]]\").save(),\n    ],\n).save()\n\n# define the ['rna'].obs schema\nobs_schema_rna = ln.Schema(\n    name=\"mudata_papalexi21_subset_rna_obs_schema\",\n    features=[\n        ln.Feature(name=\"nCount_RNA\", dtype=int).save(),\n        ln.Feature(name=\"nFeature_RNA\", dtype=int).save(),\n        ln.Feature(name=\"percent.mito\", dtype=float).save(),\n    ],\n).save()\n\n# define the ['hto'].obs schema\nobs_schema_hto = ln.Schema(\n    name=\"mudata_papalexi21_subset_hto_obs_schema\",\n    features=[\n        ln.Feature(name=\"nCount_HTO\", dtype=float).save(),\n        ln.Feature(name=\"nFeature_HTO\", dtype=int).save(),\n        ln.Feature(name=\"technique\", dtype=bt.ExperimentalFactor).save(),\n    ],\n).save()\n\n# define ['rna'].var schema\nvar_schema_rna = ln.Schema(\n    name=\"mudata_papalexi21_subset_rna_var_schema\",\n    itype=bt.Gene.symbol,\n    dtype=float,\n).save()\n\n# define composite schema\nmudata_schema = ln.Schema(\n    name=\"mudata_papalexi21_subset_mudata_schema\",\n    otype=\"MuData\",\n    slots={\n        \"obs\": obs_schema,\n        \"rna:obs\": obs_schema_rna,\n        \"hto:obs\": obs_schema_hto,\n        \"rna:var\": 
var_schema_rna,\n        \"uns:study_metadata\": study_metadata_schema,\n    },\n).save()\n\n# curate a MuData\nmdata = ln.examples.datasets.mudata_papalexi21_subset(with_uns=True)\nbt.settings.organism = \"human\"  # set the organism to map gene symbols\ncurator = ln.curators.MuDataCurator(mdata, mudata_schema)\nartifact = curator.save_artifact(key=\"examples/mudata_papalexi21_subset.h5mu\")\nassert artifact.schema == mudata_schema\n"
  },
  {
    "path": "docs/scripts/curate_soma_experiment.py",
    "content": "import lamindb as ln\nimport bionty as bt\nimport tiledbsoma as soma\nimport tiledbsoma.io\n\nadata = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"AnnData\")\ntiledbsoma.io.from_anndata(\"small_dataset.tiledbsoma\", adata, measurement_name=\"RNA\")\n\nobs_schema = ln.Schema(\n    name=\"soma_obs_schema\",\n    features=[\n        ln.Feature(name=\"cell_type_by_expert\", dtype=bt.CellType).save(),\n        ln.Feature(name=\"cell_type_by_model\", dtype=bt.CellType).save(),\n    ],\n).save()\n\nvar_schema = ln.Schema(\n    name=\"soma_var_schema\",\n    features=[\n        ln.Feature(name=\"var_id\", dtype=bt.Gene.ensembl_gene_id).save(),\n    ],\n    coerce=True,\n).save()\n\nsoma_schema = ln.Schema(\n    name=\"soma_experiment_schema\",\n    otype=\"tiledbsoma\",\n    slots={\n        \"obs\": obs_schema,\n        \"ms:RNA.T\": var_schema,\n    },\n).save()\n\nwith soma.Experiment.open(\"small_dataset.tiledbsoma\") as experiment:\n    curator = ln.curators.TiledbsomaExperimentCurator(experiment, soma_schema)\n    curator.validate()\n    artifact = curator.save_artifact(\n        key=\"examples/soma_experiment.tiledbsoma\",\n        description=\"SOMA experiment with schema validation\",\n    )\nassert artifact.schema == soma_schema\nartifact.describe()\n"
  },
  {
    "path": "docs/scripts/curate_spatialdata.py",
    "content": "import lamindb as ln\n\nspatialdata = ln.examples.datasets.spatialdata_blobs()\nsdata_schema = ln.Schema.get(name=\"spatialdata_blobs_schema\")\ncurator = ln.curators.SpatialDataCurator(spatialdata, sdata_schema)\ntry:\n    curator.validate()\nexcept ln.errors.ValidationError:\n    pass\n\nspatialdata.tables[\"table\"].var.drop(index=\"ENSG00000999999\", inplace=True)\n\n# validate again (must pass now) and save artifact\nartifact = ln.Artifact.from_spatialdata(\n    spatialdata, key=\"examples/spatialdata1.zarr\", schema=sdata_schema\n).save()\nartifact.describe()\n"
  },
  {
    "path": "docs/scripts/define_schema_anndata_uns.py",
    "content": "import lamindb as ln\n\nfrom define_schema_df_metadata import study_metadata_schema\n\nanndata_uns_schema = ln.Schema(\n    otype=\"AnnData\",\n    slots={\n        \"uns:study_metadata\": study_metadata_schema,\n    },\n).save()\n"
  },
  {
    "path": "docs/scripts/define_schema_df_metadata.py",
    "content": "import lamindb as ln\n\nstudy_metadata_schema = ln.Schema(\n    name=\"Study metadata schema\",\n    features=[\n        ln.Feature(name=\"temperature\", dtype=float).save(),\n        ln.Feature(name=\"experiment\", dtype=str).save(),\n    ],\n).save()\n"
  },
  {
    "path": "docs/scripts/define_schema_spatialdata.py",
    "content": "import lamindb as ln\nimport bionty as bt\n\n# a very comprehensive schema for different slots of a SpatialData object\n\n# define or query features\nbio_dict = ln.Feature(name=\"bio\", dtype=dict).save()\ntech_dict = ln.Feature(name=\"tech\", dtype=dict).save()\ndisease = ln.Feature(name=\"disease\", dtype=bt.Disease, coerce=True).save()\ndevelopmental_stage = ln.Feature(\n    name=\"developmental_stage\",\n    dtype=bt.DevelopmentalStage,\n    coerce=True,\n).save()\nassay = ln.Feature(name=\"assay\", dtype=bt.ExperimentalFactor, coerce=True).save()\nsample_region = ln.Feature(name=\"sample_region\", dtype=str).save()\nanalysis = ln.Feature(name=\"analysis\", dtype=str).save()\n\n# define or query schema components\nattrs_schema = ln.Schema([bio_dict, tech_dict]).save()\nsample_schema = ln.Schema([disease, developmental_stage]).save()\ntech_schema = ln.Schema([assay]).save()\nobs_schema = ln.Schema([sample_region]).save()\nuns_schema = ln.Schema([analysis]).save()\n# enforces only registered Ensembl Gene IDs pass validation (maximal_set=True)\nvarT_schema = ln.Schema(itype=bt.Gene.ensembl_gene_id, maximal_set=True).save()\n\n# compose the SpatialData schema\nsdata_schema = ln.Schema(\n    name=\"spatialdata_blobs_schema\",\n    otype=\"SpatialData\",\n    slots={\n        \"attrs:bio\": sample_schema,\n        \"attrs:tech\": tech_schema,\n        \"attrs\": attrs_schema,\n        \"tables:table:obs\": obs_schema,\n        \"tables:table:var.T\": varT_schema,\n    },\n).save()\n"
  },
  {
    "path": "docs/scripts/my_workflow.py",
    "content": "import lamindb as ln\n\n\n@ln.flow()\ndef ingest_dataset(key: str) -> ln.Artifact:\n    df = ln.examples.datasets.mini_immuno.get_dataset1()\n    artifact = ln.Artifact.from_dataframe(df, key=key).save()\n    return artifact\n\n\nif __name__ == \"__main__\":\n    ingest_dataset(key=\"my_analysis/dataset.parquet\")\n"
  },
  {
    "path": "docs/scripts/my_workflow_with_click.py",
    "content": "import click\nimport lamindb as ln\n\n\n@click.command()\n@click.option(\"--key\", required=True)\n@ln.flow()\ndef main(key: str):\n    df = ln.examples.datasets.mini_immuno.get_dataset2()\n    ln.Artifact.from_dataframe(df, key=key).save()\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "docs/scripts/my_workflow_with_step.py",
    "content": "import lamindb as ln\n\n\n@ln.step()\ndef subset_dataframe(\n    artifact: ln.Artifact,\n    subset_rows: int = 2,\n    subset_cols: int = 2,\n) -> ln.Artifact:\n    df = artifact.load()\n    new_data = df.iloc[:subset_rows, :subset_cols]\n    new_key = artifact.key.replace(\".parquet\", \"_subsetted.parquet\")\n    return ln.Artifact.from_dataframe(new_data, key=new_key).save()\n\n\n@ln.flow()\ndef ingest_dataset(key: str, subset: bool = False) -> ln.Artifact:\n    df = ln.examples.datasets.mini_immuno.get_dataset1()\n    artifact = ln.Artifact.from_dataframe(df, key=key).save()\n    if subset:\n        artifact = subset_dataframe(artifact)\n    return artifact\n\n\nif __name__ == \"__main__\":\n    ingest_dataset(key=\"my_analysis/dataset.parquet\", subset=True)\n"
  },
  {
    "path": "docs/scripts/run_script_with_step.py",
    "content": "import argparse\nimport lamindb as ln\n\n\n@ln.step()\ndef subset_dataframe(\n    artifact: ln.Artifact,\n    subset_rows: int = 2,\n    subset_cols: int = 2,\n    run: ln.Run | None = None,\n) -> ln.Artifact:\n    dataset = artifact.load(is_run_input=run)\n    new_data = dataset.iloc[:subset_rows, :subset_cols]\n    new_key = artifact.key.replace(\".parquet\", \"_subsetted.parquet\")\n    return ln.Artifact.from_dataframe(new_data, key=new_key, run=run).save()\n\n\nif __name__ == \"__main__\":\n    p = argparse.ArgumentParser()\n    p.add_argument(\"--subset\", action=\"store_true\")\n    args = p.parse_args()\n\n    params = {\"is_subset\": args.subset}\n\n    ln.track(params=params)\n\n    if args.subset:\n        df = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"DataFrame\")\n        artifact = ln.Artifact.from_dataframe(\n            df, key=\"my_analysis/dataset.parquet\"\n        ).save()\n        subsetted_artifact = subset_dataframe(artifact)\n\n    ln.finish()\n"
  },
  {
    "path": "docs/scripts/run_track_and_finish.py",
    "content": "import lamindb as ln\n\nln.track()  # initiate a tracked notebook/script run\n\n# your code automatically tracks inputs & outputs\n\nln.finish()  # mark run as finished, save execution report, source code & environment\n"
  },
  {
    "path": "docs/scripts/run_track_with_features_and_params.py",
    "content": "import argparse\nimport lamindb as ln\n\n\nif __name__ == \"__main__\":\n    p = argparse.ArgumentParser()\n    p.add_argument(\"--s3-folder\", type=str)\n    p.add_argument(\"--experiment\", type=str)\n    args = p.parse_args()\n    features = {\n        \"s3_folder\": args.s3_folder,\n        \"experiment\": args.experiment,\n    }\n    ln.track(features=features, params={\"example_param\": 42})\n\n    # your code\n\n    ln.finish()\n"
  },
  {
    "path": "docs/scripts/run_track_with_params.py",
    "content": "import argparse\nimport lamindb as ln\n\nif __name__ == \"__main__\":\n    p = argparse.ArgumentParser()\n    p.add_argument(\"--input-dir\", type=str)\n    p.add_argument(\"--downsample\", action=\"store_true\")\n    p.add_argument(\"--learning-rate\", type=float)\n    args = p.parse_args()\n    params = {\n        \"input_dir\": args.input_dir,\n        \"learning_rate\": args.learning_rate,\n        \"preprocess_params\": {\n            \"downsample\": args.downsample,\n            \"normalization\": \"the_good_one\",\n        },\n    }\n    ln.track(params=params)\n\n    # your code\n\n    ln.finish()\n"
  },
  {
    "path": "docs/scripts/synced_with_git.py",
    "content": "import lamindb as ln\n\nln.settings.sync_git_repo = \"https://github.com/...\"\nln.track()\n# your code\nln.finish()\n"
  },
  {
    "path": "docs/storage/add-replace-cache.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Add, replace, cache and delete artifacts\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"1\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import pytest\\n\",\n    \"import shutil\\n\",\n    \"import lamindb as ln\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"2\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ln.setup.login(\\\"testuser1\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"3\",\n   \"metadata\": {\n    \"tags\": [\n     \"hide-output\",\n     \"hide-cell\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"try:\\n\",\n    \"    root_path = ln.UPath(\\\"s3://lamindb-ci/test-add-replace-cache\\\")\\n\",\n    \"    if root_path.exists():\\n\",\n    \"        root_path.rmdir()\\n\",\n    \"    ln.setup.delete(\\\"testuser1/test-add-replace-cache\\\", force=True)\\n\",\n    \"except BaseException:  # noqa: S110\\n\",\n    \"    pass\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"4\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ln.setup.init(storage=\\\"s3://lamindb-ci/test-add-replace-cache\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"5\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Save with auto-managed (`key=None`)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"6\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"AUTO_KEY_PREFIX = ln.core.storage.paths.AUTO_KEY_PREFIX\\n\",\n    \"root = ln.settings.storage.root\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"7\",\n   \"metadata\": {},\n   \"outputs\": [],\n   
\"source\": [\n    \"artifact = ln.Artifact(\\\"./test-files/iris.csv\\\", description=\\\"iris.csv\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"8\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"9\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"key_path = root / f\\\"{AUTO_KEY_PREFIX}{artifact.uid}{artifact.suffix}\\\"\\n\",\n    \"assert key_path.exists()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"10\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"cache_csv_path = artifact.cache()\\n\",\n    \"print(cache_csv_path)\\n\",\n    \"assert cache_csv_path.suffix == \\\".csv\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"11\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.replace(\\\"./test-files/iris.data\\\")\\n\",\n    \"artifact.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"12\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"old_key_path = key_path\\n\",\n    \"new_key_path = root / f\\\"{AUTO_KEY_PREFIX}{artifact.uid}{artifact.suffix}\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"13\",\n   \"metadata\": {},\n   \"source\": [\n    \"The suffix changed:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"14\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(old_key_path)\\n\",\n    \"print(new_key_path)\\n\",\n    \"assert not old_key_path.exists()\\n\",\n    \"assert new_key_path.exists()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"15\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n  
  \"cache_data_path = artifact.cache()\\n\",\n    \"print(cache_data_path)\\n\",\n    \"assert cache_data_path.suffix == \\\".data\\\"\\n\",\n    \"assert cache_data_path.stat().st_mtime >= cache_csv_path.stat().st_mtime\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"16\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.delete(permanent=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"17\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Save with manually passed real `key`\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"18\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ln.settings.creation._artifact_use_virtual_keys = False\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"19\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact = ln.Artifact(\\\"./test-files/iris.csv\\\", key=\\\"iris.csv\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"20\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"21\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"key_path = root / \\\"iris.csv\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"22\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert key_path.exists()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"23\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.replace(\\\"./test-files/new_iris.csv\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"24\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": 
[\n    \"artifact.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"25\",\n   \"metadata\": {},\n   \"source\": [\n    \"Check paths: no changes here, as the suffix didn't change.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"26\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"old_key_path = key_path\\n\",\n    \"new_key_path = root / \\\"new_iris.csv\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"27\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"old_key_path\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"28\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"new_key_path\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"29\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert old_key_path.exists()\\n\",\n    \"assert not new_key_path.exists()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"30\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.replace(\\\"./test-files/iris.data\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"31\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"32\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"new_key_path = root / \\\"iris.data\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"33\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"old_key_path\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"34\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n 
   \"new_key_path\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"35\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert not old_key_path.exists()\\n\",\n    \"assert new_key_path.exists()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"36\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.delete(permanent=True, storage=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"37\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Save from memory\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"38\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import pandas as pd\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"39\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"iris = pd.read_csv(\\\"./test-files/iris.csv\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"40\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact = ln.Artifact.from_dataframe(\\n\",\n    \"    iris, description=\\\"iris_store\\\", key=\\\"iris.parquet\\\"\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"41\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"42\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"key_path = root / \\\"iris.parquet\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"43\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert key_path.exists()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"44\",\n  
 \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.replace(data=iris[:-1])\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"45\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert artifact.key == \\\"iris.parquet\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"46\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"47\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert key_path.exists()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"48\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.replace(\\\"./test-files/new_iris.csv\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"49\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"50\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"old_key_path = key_path\\n\",\n    \"new_key_path = root / \\\"iris.csv\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"51\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"old_key_path\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"52\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"new_key_path\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"53\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert not old_key_path.exists()\\n\",\n    \"assert new_key_path.exists()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   
\"execution_count\": null,\n   \"id\": \"54\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# we use the path in the next sections\\n\",\n    \"path_in_storage = artifact.path\\n\",\n    \"artifact.delete(permanent=True, storage=False)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"55\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Save with manually passed virtual `key`\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"56\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ln.settings.creation._artifact_use_virtual_keys = True\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"57\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact = ln.Artifact(\\\"./test-files/iris.csv\\\", key=\\\"iris.csv\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"58\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"59\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"with pytest.raises(ValueError) as err:\\n\",\n    \"    artifact.replace(path_in_storage)\\n\",\n    \"assert err.exconly().startswith(\\n\",\n    \"    \\\"ValueError: Can only replace with a local path not in any Storage.\\\"\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"60\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# return an existing artifact if the hash is the same\\n\",\n    \"assert artifact == artifact.replace(\\\"./test-files/iris.csv\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"61\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"fpath = artifact.path\\n\",\n    \"assert 
fpath.suffix == \\\".csv\\\" and fpath.stem == artifact.uid\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"62\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.replace(\\\"./test-files/iris.data\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"63\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"64\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert artifact.key == \\\"iris.data\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"65\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert not fpath.exists()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"66\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"fpath = artifact.path\\n\",\n    \"assert fpath.suffix == \\\".data\\\" and fpath.stem == artifact.uid\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"67\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.delete(permanent=True, storage=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"68\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Save in existing storage with a virtual `key`\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"69\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact = ln.Artifact(path_in_storage, key=\\\"iris_test.csv\\\").save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"70\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert artifact._real_key.endswith(\\\"iris.csv\\\")\"\n   ]\n  },\n  {\n   
\"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"71\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.replace(\\\"./test-files/iris.data\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"72\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert artifact._real_key.endswith(\\\"iris.data\\\")\\n\",\n    \"assert artifact._clear_storagekey.endswith(\\\"iris.csv\\\")\\n\",\n    \"assert artifact.key == \\\"iris_test.data\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"73\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"74\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"path = artifact.path\\n\",\n    \"\\n\",\n    \"assert path.name == \\\"iris.data\\\"\\n\",\n    \"assert path.exists()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"75\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert not path_in_storage.exists()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"76\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.delete(permanent=True, storage=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"77\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Replace with folder artifacts\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"78\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"adata = ln.examples.datasets.anndata_pbmc68k_reduced()\\n\",\n    \"\\n\",\n    \"adata.write_zarr(\\\"./test-files/pbmc68k.zarr\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"79\",\n   
\"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact = ln.Artifact(\\\"./test-files/pbmc68k.zarr\\\", key=\\\"pbmc68k.zarr\\\").save()\\n\",\n    \"save_hash = artifact.hash\\n\",\n    \"save_n_files = artifact.n_files\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"80\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"with pytest.raises(ValueError) as err:\\n\",\n    \"    artifact.replace(\\\"./test-files/iris.csv\\\")\\n\",\n    \"assert err.exconly().endswith(\\\"It is not allowed to replace a folder with a file.\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"81\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert save_hash is not None\\n\",\n    \"assert artifact.path.is_dir()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"82\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"adata.obs[\\\"add_new_col\\\"] = \\\"new\\\"\\n\",\n    \"\\n\",\n    \"adata.write_zarr(\\\"./test-files/pbmc68k_new.zarr\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"83\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.replace(\\\"./test-files/pbmc68k_new.zarr\\\")\\n\",\n    \"artifact.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"84\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert artifact.key == \\\"pbmc68k.zarr\\\"\\n\",\n    \"assert artifact.hash != save_hash\\n\",\n    \"assert artifact.n_files != save_n_files\\n\",\n    \"assert artifact.path.is_dir()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"85\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"shutil.rmtree(artifact.cache())\"\n   ]\n  },\n  {\n   \"cell_type\": 
\"code\",\n   \"execution_count\": null,\n   \"id\": \"86\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"with artifact.open() as store:\\n\",\n    \"    assert \\\"add_new_col\\\" in store.obs\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"87\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# checks that .open above opened the cloud path without syncing\\n\",\n    \"assert not artifact._cache_path.exists()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"88\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"shutil.rmtree(\\\"./test-files/pbmc68k.zarr\\\")\\n\",\n    \"shutil.rmtree(\\\"./test-files/pbmc68k_new.zarr\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"89\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.delete(permanent=True, storage=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"90\",\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"ln.setup.delete(\\\"test-add-replace-cache\\\", force=True)\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3 (ipykernel)\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.10.16\"\n  },\n  \"nbproject\": {\n   \"id\": \"uBQMCcdYwEjA\",\n   \"parent\": null,\n   \"pypackage\": null,\n   \"time_init\": \"2023-04-04T16:26:17.675023+00:00\",\n   \"user_handle\": \"Koncopd\",\n   \"user_id\": \"qTQ5q0ar\",\n   \"user_name\": \"Sergei Rybakov\",\n   
\"version\": \"0\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "docs/storage/anndata-accessor.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Test `AnnDataAccessor`\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import lamindb as ln\\n\",\n    \"\\n\",\n    \"ln.setup.init(storage=\\\"s3://lamindb-ci/test-anndata\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We'll need some test data:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"ln.Artifact(\\\"s3://lamindb-ci/test-anndata/pbmc68k.h5ad\\\").save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"An `h5ad` artifact stored on s3:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact = ln.Artifact.filter(key=\\\"pbmc68k.h5ad\\\").one()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.path\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"adata = artifact.open()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"adata\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"It is possible to access `AnnData` attributes without loading them into memory\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"hide-cell\"\n    ]\n   },\n   \"outputs\": [],\n   
\"source\": [\n    \"print(adata.obsm)\\n\",\n    \"print(adata.varm)\\n\",\n    \"print(adata.obsp)\\n\",\n    \"print(adata.varm)\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"However, `.obs`, `.var` and `.uns` are always loaded fully into memory on `AnnDataAccessor` initialization\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"adata.obs.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"adata.var.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"adata.uns.keys()\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Without subsetting, the `AnnDataAccessor` object gives references to underlying lazy `h5` or `zarr` arrays:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"adata.X\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"adata.obsm[\\\"X_pca\\\"]\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"And to a lazy `SparseDataset` from the `anndata` package:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"adata.obsp[\\\"distances\\\"]\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Get a subset of the object, attributes are loaded only on explicit access:\"\n   ]\n  },\n  {\n   
\"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"obs_idx = adata.obs.cell_type.isin([\\\"Dendritic cells\\\", \\\"CD14+ Monocytes\\\"]) & (\\n\",\n    \"    adata.obs.percent_mito <= 0.05\\n\",\n    \")\\n\",\n    \"adata_subset = adata[obs_idx]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"adata_subset\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Check shapes of the subset\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"hide-cell\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"num_idx = sum(obs_idx)\\n\",\n    \"assert adata_subset.shape == (num_idx, adata.shape[1])\\n\",\n    \"assert (adata_subset.obs.cell_type == \\\"CD34+\\\").sum() == 0\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"adata_subset.obs.cell_type.value_counts()\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Subsets load the arrays into memory only on direct access\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(adata_subset.X)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(adata_subset.obsm[\\\"X_pca\\\"])\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"hide-cell\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"assert adata_subset.obsp[\\\"distances\\\"].shape[0] == num_idx\"\n   ]\n  },\n  
{\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"To load the entire subset into memory as an actual `AnnData` object, use `to_memory()`:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"adata_subset.to_memory()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"hide-cell\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"!lamin delete --force test-anndata\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3 (ipykernel)\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.10.13\"\n  },\n  \"nbproject\": {\n   \"id\": \"YVUCtH4GfQOy\",\n   \"parent\": null,\n   \"pypackage\": null,\n   \"time_init\": \"2023-01-23T08:28:32.097943+00:00\",\n   \"user_handle\": \"testuser1\",\n   \"user_id\": \"DzTjkKse\",\n   \"user_name\": \"Test User1\",\n   \"version\": \"0\"\n  },\n  \"vscode\": {\n   \"interpreter\": {\n    \"hash\": \"ae1fefc8646a06dd2e75004cd934adda7c5727b046986a772e3b44b0ffba9754\"\n   }\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "docs/storage/prepare-sync-local-to-cloud.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Prepare sync artifacts from a local instance to a cloud instance\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!lamin disconnect\\n\",\n    \"# need to add pertdb to environment in order to import it\\n\",\n    \"!lamin settings modules set bionty,pertdb\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import lamindb as ln\\n\",\n    \"import bionty as bt\\n\",\n    \"import pertdb\\n\",\n    \"import pandas as pd\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ln.setup.init(storage=\\\"./test-sync-to-cloud\\\", modules=\\\"bionty,pertdb\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact = ln.Artifact.from_dataframe(\\n\",\n    \"    pd.DataFrame({\\\"a\\\": [1, 2, 3]}), description=\\\"test-sync-to-cloud\\\"\\n\",\n    \").save()\\n\",\n    \"features = bt.CellMarker.from_values(\\n\",\n    \"    [\\\"PD1\\\", \\\"CD21\\\"], field=bt.CellMarker.name, organism=\\\"human\\\"\\n\",\n    \").save()\\n\",\n    \"artifact.features._add_schema(ln.Schema(features), slot=\\\"var\\\")\\n\",\n    \"organism = bt.Organism.from_source(name=\\\"human\\\").save()\\n\",\n    \"artifact.labels.add(organism)\\n\",\n    \"compound = pertdb.Compound(name=\\\"compound-test-sync-to-cloud\\\").save()\\n\",\n    \"artifact.compounds.add(compound)\\n\",\n    \"artifact.describe()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert 
artifact.features.slots[\\\"var\\\"].members.count() == 2\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"py312\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.12.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "docs/storage/sync-local-to-cloud.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Sync artifacts from a local instance to a cloud instance\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import lamindb as ln\\n\",\n    \"\\n\",\n    \"ln.connect(\\\"laminlabs/lamin-dev\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"hide-cell\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"def cleanup(artifact: ln.Artifact):\\n\",\n    \"    features_sets = artifact.schemas.all()\\n\",\n    \"    compounds = artifact.compounds.all()\\n\",\n    \"    artifact.delete(permanent=True, storage=False)\\n\",\n    \"    features_sets.delete()\\n\",\n    \"    compounds.delete()\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"artifacts = ln.Artifact.filter(description=\\\"test-sync-to-cloud\\\")\\n\",\n    \"for artifact in artifacts:\\n\",\n    \"    cleanup(artifact)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact = ln.Artifact.connect(\\\"testuser1/test-sync-to-cloud\\\").get(\\n\",\n    \"    description=\\\"test-sync-to-cloud\\\"\\n\",\n    \")\\n\",\n    \"artifact.describe()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.save(transfer=\\\"annotations\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"artifact.describe()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert artifact._state.db == \\\"default\\\"\\n\",\n    \"assert 
artifact.organisms.get().name == \\\"human\\\"\\n\",\n    \"assert artifact.compounds.get().name == \\\"compound-test-sync-to-cloud\\\"\\n\",\n    \"assert artifact.features.slots[\\\"var\\\"].members.count() == 2\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"hide-cell\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"!rm -r ./test-sync-to-cloud\\n\",\n    \"!lamin delete --force test-sync-to-cloud\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"py312\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.12.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "docs/storage/test-files/iris.data",
    "content": "5.1,3.5,1.4,0.2,Iris-setosa\n4.9,3.0,1.4,0.2,Iris-setosa\n7.0,3.2,4.7,1.4,Iris-versicolor\n6.4,3.2,4.5,1.5,Iris-versicolor\n6.2,3.4,5.4,2.3,Iris-virginica\n5.9,3.0,5.1,1.8,Iris-virginica\n"
  },
  {
    "path": "docs/storage/test_notebooks.py",
    "content": "from pathlib import Path\n\nimport nbproject_test as test\n\nimport lamindb as ln\n\n\ndef test_notebooks():\n    nbdir = Path(__file__).parent\n    ln.setup.login(\"testuser1\")\n    test.execute_notebooks(nbdir, write=True)\n"
  },
  {
    "path": "docs/storage/upload.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Track artifacts, in-memory objects & folders [S3 storage]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"1\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import lamindb as ln\\n\",\n    \"import pytest\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"2\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ln.setup.login(\\\"testuser1\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"3\",\n   \"metadata\": {\n    \"tags\": [\n     \"hide-output\",\n     \"hide-cell\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"try:\\n\",\n    \"    root_path = ln.UPath(\\\"s3://lamindb-ci/test-upload\\\")\\n\",\n    \"    if root_path.exists():\\n\",\n    \"        root_path.rmdir()\\n\",\n    \"    ln.setup.delete(\\\"testuser1/test-upload\\\", force=True)\\n\",\n    \"except BaseException:  # noqa: S110\\n\",\n    \"    pass\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"4\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ln.setup.init(storage=\\\"s3://lamindb-ci/test-upload\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"5\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Local artifacts\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"6\",\n   \"metadata\": {},\n   \"source\": [\n    \"Some test data.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"7\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"pbmc68k = ln.examples.datasets.anndata_pbmc68k_reduced()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"8\",\n   \"metadata\": {},\n   \"source\": [\n    \"Subset 
to a mini artifact to speed up the run time of this notebook:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"9\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"pbmc68k = pbmc68k[:5, :5].copy()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"10\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"pbmc68k\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"11\",\n   \"metadata\": {},\n   \"source\": [\n    \"###  Upload from memory using explicit semantic `key`\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"12\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Upload h5ad\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"13\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"pbmc68k_h5ad = ln.Artifact.from_anndata(pbmc68k, key=\\\"test-upload/pbmc68k.h5ad\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"14\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"pbmc68k_h5ad.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"15\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"pbmc68k_h5ad.delete(permanent=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"16\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Upload zarr\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"17\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Runs too long, should be tested elsewhere\\n\",\n    \"# pbmc68k_zarr = ln.Artifact(pbmc68k, key=\\\"test-upload/pbmc68k.zarr\\\", format=\\\"zarr\\\")\\n\",\n    \"# ln.save(pbmc68k_zarr)\\n\",\n    \"# pbmc68k_zarr.delete(permanent=True, storage=True)\"\n   ]\n  },\n  {\n   \"cell_type\": 
\"markdown\",\n   \"id\": \"18\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Upload using `id` with implicit `key`\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"19\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Upload h5ad\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"20\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"pbmc68k_h5ad = ln.Artifact.from_anndata(pbmc68k, description=\\\"pbmc68k.h5ad\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"21\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"pbmc68k_h5ad.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"22\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"pbmc68k_h5ad.delete(permanent=True, storage=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"23\",\n   \"metadata\": {},\n   \"source\": [\n    \"#### Upload zarr\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"24\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Runs too long, should be tested elsewhere\\n\",\n    \"# pbmc68k_zarr = ln.Artifact(pbmc68k, name=\\\"pbmc68k.zarr\\\", format=\\\"zarr\\\")\\n\",\n    \"# ln.save(pbmc68k_zarr)\\n\",\n    \"# pbmc68k_zarr.delete(permanent=True, storage=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"25\",\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"source\": [\n    \"### Error behaviors\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"26\",\n   \"metadata\": {},\n   \"source\": [\n    \"Specified file does not exist.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"27\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"with pytest.raises(FileNotFoundError):\\n\",\n    \"    
non_existent_h5ad = ln.Artifact(\\n\",\n    \"        \\\"s3://lamindb-ci/test-upload/non_existent_file.h5ad\\\"\\n\",\n    \"    )\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"28\",\n   \"metadata\": {},\n   \"source\": [\n    \"Specified bucket does not exist. Normally, a non-existent bucket raises `FileNotFoundError`, but sometimes it starts to raise `PermissionError`.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"29\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"with pytest.raises((FileNotFoundError, PermissionError)):\\n\",\n    \"    non_existent_h5ad = ln.Artifact(\\n\",\n    \"        \\\"s3://non_existent_bucket_6612366/non_existent_file.h5ad\\\"\\n\",\n    \"    )\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"30\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Test existing zarr\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"31\",\n   \"metadata\": {},\n   \"source\": [\n    \"See `test_artifact.py` for other artifact types.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"32\",\n   \"metadata\": {},\n   \"source\": [\n    \"This should probably go elsewhere:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"33\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# temporarily comment out because of head bucket permission error when\\n\",\n    \"# attempting to get region\\n\",\n    \"# artifact = ln.Artifact(\\\"s3://lamindb-ci/lndb-storage/pbmc68k.zarr\\\")\\n\",\n    \"# artifact.save()\\n\",\n    \"# artifact.open()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"34\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ln.setup.delete(\\\"test-upload\\\", force=True)\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3 (ipykernel)\",\n   \"language\": 
\"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.10.16\"\n  },\n  \"nbproject\": {\n   \"id\": \"psZgub4FOmzS\",\n   \"parent\": null,\n   \"pypackage\": null,\n   \"time_init\": \"2023-04-09T20:01:57.780053+00:00\",\n   \"user_handle\": \"testuser1\",\n   \"user_id\": \"DzTjkKse\",\n   \"user_name\": \"Test User1\",\n   \"version\": \"0\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "docs/storage/vitessce.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Vitessce integration\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"For more comprehensive integration tests, see: https://github.com/laminlabs/lamin-spatial\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!lamin login testuser1\\n\",\n    \"!lamin init --storage \\\"s3://lamindb-ci/test-vitessce\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import lamindb as ln\\n\",\n    \"import pytest\\n\",\n    \"from vitessce import (\\n\",\n    \"    VitessceConfig,\\n\",\n    \"    AnnDataWrapper,\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Set up test data:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"pbmc68k = ln.examples.datasets.anndata_pbmc68k_reduced()[:100, :200].copy()\\n\",\n    \"zarr_filepath = \\\"my_test.zarr\\\"\\n\",\n    \"# write the anndata to a local zarr path\\n\",\n    \"pbmc68k.write_zarr(zarr_filepath)\\n\",\n    \"# create an artifact from the path\\n\",\n    \"dataset_artifact = ln.Artifact(zarr_filepath, description=\\\"Test dataset\\\").save()\\n\",\n    \"# this is the where the zarr folder is located on a public S3 bucket\\n\",\n    \"dataset_artifact.path.to_url()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Create a `VitessceConfig` object: \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"tags\": [\n     \"hide-output\"\n    ]\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"vc = 
VitessceConfig(schema_version=\\\"1.0.15\\\")\\n\",\n    \"vc.add_dataset(name=\\\"test1\\\").add_object(\\n\",\n    \"    AnnDataWrapper(\\n\",\n    \"        adata_artifact=dataset_artifact,\\n\",\n    \"        obs_embedding_paths=[\\\"obsm/X_umap\\\"],\\n\",\n    \"    ),\\n\",\n    \")\\n\",\n    \"vc.to_dict()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"vitessce_config_artifact = ln.integrations.save_vitessce_config(\\n\",\n    \"    vc, description=\\\"View testdata in Vitessce\\\"\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# different equivalent ways of testing that the action is attached\\n\",\n    \"assert dataset_artifact._actions.get() == vitessce_config_artifact\\n\",\n    \"assert vitessce_config_artifact._action_targets.get() == dataset_artifact\\n\",\n    \"assert vitessce_config_artifact._actions.first() is None\\n\",\n    \"assert vitessce_config_artifact.kind == \\\"__lamindb_config__\\\"\\n\",\n    \"assert ln.Artifact.get(_actions=vitessce_config_artifact) == dataset_artifact\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"dataset_artifact.delete(permanent=True)\\n\",\n    \"vitessce_config_artifact.delete(permanent=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Test validation within `save_vitessce_config`:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# pass an artifact URL instead of the artifact object itself\\n\",\n    \"vc = VitessceConfig(schema_version=\\\"1.0.15\\\")\\n\",\n    \"with pytest.raises(AttributeError) as error:\\n\",\n    \"    
vc.add_dataset(name=\\\"test1\\\").add_object(\\n\",\n    \"        AnnDataWrapper(\\n\",\n    \"            adata_artifact=dataset_artifact.path.to_url(),\\n\",\n    \"            obs_embedding_paths=[\\\"obsm/X_umap\\\"],\\n\",\n    \"        ),\\n\",\n    \"    )\\n\",\n    \"print(error.exconly())\\n\",\n    \"assert error.exconly().startswith(\\n\",\n    \"    \\\"AttributeError: 'str' object has no attribute 'path'\\\"\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!rm -rf test-vitessce\\n\",\n    \"!lamin delete --force test-vitessce\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"py312\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.12.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "docs/storage.md",
    "content": "# Storage\n\n```{toctree}\n:maxdepth: 1\n\nstorage/upload\nstorage/add-replace-cache\nstorage/anndata-accessor\nstorage/prepare-sync-local-to-cloud\nstorage/sync-local-to-cloud\nstorage/vitessce\n```\n"
  },
  {
    "path": "docs/sync.md",
    "content": "---\nexecute_via: python\n---\n\n# Sync data across databases\n\nThis guide shows how to sync objects from a source database to your default database.\n\nWe need a target database:\n\n```python\n!lamin init --storage ./test-sync --modules bionty\n```\n\nImport `lamindb` and optionally run `ln.track()`:\n\n```python\nimport lamindb as ln\n\nln.track()\n```\n\nSyncing works for any object type (`Artifact`, `Record`, `Transform`, `ULabel`, etc.). Let's sync an artifact to our current default database:\n\n```python\ndb = ln.DB(\"laminlabs/lamindata\")\n# query the artifact on the source database\nartifact = db.Artifact.get(key=\"example_datasets/mini_immuno/dataset1.h5ad\")\n# sync the artifact to the current database\nartifact.save()\n```\n\nIf you also want to sync feature & label annotations, pass `transfer=\"annotations\"`:\n\n```python\n# query again so that `artifact` holds the object on the source database\nartifact = db.Artifact.get(key=\"example_datasets/mini_immuno/dataset1.h5ad\")\n# sync the artifact to the current database, including transfer of annotations where necessary\nartifact.save(transfer=\"annotations\")\n```\n\nThe artifact now has all feature & label annotations:\n\n```python\nartifact.describe()\n```\n\nThe sync is zero-copy, which means that the data itself remained in the original storage location:\n\n```python\nartifact.path\n```\n\nData lineage indicates the source database of the sync:\n\n```python\nartifact.view_lineage()\n```\n\nThe run that initiated the sync is linked via `initiated_by_run`:\n\n```python\nartifact.run.initiated_by_run.transform\n```\n\nUpon calling `.save()` again, `lamindb` identifies that the object already exists in the target database and simply maps it:\n\n```python\nartifact = db.Artifact.get(key=\"example_datasets/mini_immuno/dataset1.h5ad\")\nartifact.save()\n```\n\n```{dropdown} How do I know if an object is in the default database or elsewhere?\n\nEvery `SQLRecord` object has an attribute 
`._state.db` which can take the following values:\n\n- `None`: the object has not yet been saved to any database\n- `\"default\"`: the object is saved on the default database instance\n- `\"account/name\"`: the object is saved on a non-default database instance referenced by `account/name` (e.g., `laminlabs/lamindata`)\n\n```\n\n```python tags=[\"hide-cell\"]\n# test the last 3 cells here\nassert artifact.transform.description == \"Transfer from `laminlabs/lamindata`\"\nassert artifact.transform.key == \"__lamindb_transfer__/4XIuR0tvaiXM\"\nassert artifact.transform.uid == \"4XIuR0tvaiXM0000\"\nassert artifact.run.initiated_by_run.transform.description.startswith(\"Sync data\")\n```\n"
  },
  {
    "path": "docs/test_notebooks.py",
    "content": "import sys\nfrom pathlib import Path\n\nimport nbproject_test as test\n\nsys.path[:0] = [str(Path(__file__).parent.parent)]\n\nfrom noxfile import GROUPS\n\nDOCS = Path(__file__).parents[1] / \"docs/\"\n\n\ndef test_tutorial():\n    for artifactname in GROUPS[\"tutorial\"]:\n        test.execute_notebooks(DOCS / artifactname, write=True)\n\n\ndef test_guide():\n    for artifactname in GROUPS[\"guide\"]:\n        test.execute_notebooks(DOCS / artifactname, write=True)\n\n\ndef test_tiledbsoma():\n    for artifactname in GROUPS[\"tiledbsoma\"]:\n        test.execute_notebooks(DOCS / artifactname, write=True)\n\n\ndef test_biology():\n    for artifactname in GROUPS[\"biology\"]:\n        test.execute_notebooks(DOCS / artifactname, write=True)\n"
  },
  {
    "path": "docs/track.md",
    "content": "---\nexecute_via: python\n---\n\n# Track notebooks, scripts & workflows\n\nThis guide walks from tracking data lineage in a notebook to tracking parameters in workflows.\n\n```{raw} html\n<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/jwnHu1PbA9Q?si=Eqn4dBZyFDrbcxvm\" title=\"YouTube video player\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen></iframe>\n```\n\n**Note:** To run examples, if you don't have a `lamindb` instance, create one:\n\n```python\n!lamin init --storage ./test-track\n```\n\n## Manage notebooks and scripts\n\nCall {meth}`~lamindb.track` to save your notebook or script as a `transform` and start tracking inputs & outputs of a run.\n\n```{eval-rst}\n.. literalinclude:: scripts/run_track_and_finish.py\n   :language: python\n```\n\n<!-- #region -->\n\nYou find your notebooks and scripts in the {class}`~lamindb.Transform` registry along with pipelines & functions:\n\n```python\ntransform = ln.Transform.get(key=\"my_analyses/my_notebook.ipynb\")\ntransform.source_code             # source code\ntransform.runs.to_dataframe()     # all runs in a dataframe\ntransform.latest_run.report       # report of latest run\ntransform.latest_run.environment  # environment of latest run\n```\n\n<!-- #endregion -->\n\n<!-- #region -->\n\nYou can use the CLI to load a transform into your current (development) directory:\n\n```bash\nlamin load --key my_analyses/my_notebook.ipynb\n```\n\n<!-- #endregion -->\n\n<!-- #region -->\n\nHere is how you'd load the [notebook from the video](https://lamin.ai/laminlabs/lamindata/transform/F4L3oC6QsZvQ) into your local directory:\n\n```bash\nlamin load https://lamin.ai/laminlabs/lamindata/transform/F4L3oC6QsZvQ\n```\n\n<!-- #endregion -->\n\n(sync-code-with-git)=\n\n### Organize local development\n\n<!-- #region -->\n\nIf no development 
directory is set, script & notebook keys equal their filenames.\nOtherwise, they represent the relative path in the development directory.\nThe exception is packaged source code, whose keys have the form `pypackages/{package_name}/path/to/file.py`.\n\nTo set the development directory to your current shell development directory, run:\n\n```bash\nlamin settings set dev-dir .\n```\n\nYou can see the current status by running:\n\n```bash\nlamin info\n```\n\nWhen you `cd` into that directory, you will now auto-connect to the configured lamindb instance.\n\nTo sync scripts or workflows with their corresponding files in a git repo, either export an environment variable:\n\n```shell\nexport LAMINDB_SYNC_GIT_REPO=<YOUR-GIT-REPO-URL>\n```\n\nOr set the following setting:\n\n```python\nln.settings.sync_git_repo = <YOUR-GIT-REPO-URL>\n```\n\nIf you work on a single project in your lamindb instance, it makes sense to set LaminDB's `dev-dir` to the root of the local git repo clone.\n\n```bash\ndbs/\n  project1/\n    .git/\n    .lamin/\n    script1.py\n    notebook1.ipynb\n  ...\n```\n\nIf you work on multiple projects in your lamindb instance, you can use the `dev-dir` as the local root and nest git repositories in it.\n\n```bash\ndbs/\n  database1/\n    .lamin/\n    repo1/\n      .git/\n    repo2/\n      .git/\n  ...\n```\n\n<!-- #endregion -->\n\n### Use projects\n\nYou can link the entities created during a run to a project.\n\n```python\nimport lamindb as ln\n\nmy_project = ln.Project(name=\"My project\").save()  # create & save a project\nln.track(project=\"My project\")  # pass project\nopen(\"sample.fasta\", \"w\").write(\">seq1\\nACGT\\n\")  # create a dataset\nln.Artifact(\"sample.fasta\", key=\"sample.fasta\").save()  # auto-labeled by project\n```\n\nFilter entities by project, e.g., artifacts:\n\n```python\nln.Artifact.filter(projects=my_project).to_dataframe()\n```\n\nAccess entities linked to a project:\n\n```python\nmy_project.artifacts.to_dataframe()\n```\n\nThe 
same works for `my_project.transforms` or `my_project.runs`.\n\n### Use spaces\n\nYou can write the entities created during a run into a space that you configure on LaminHub. This is particularly useful if you want to restrict access to a space. Note that this doesn't affect bionty entities who should typically be commonly accessible.\n\n<!-- #region -->\n\n```python\nln.track(space=\"Our team space\")\n```\n\n<!-- #endregion -->\n\n### Track agent plans\n\n<!-- #region -->\n\nSaving an agent plan automatically tags with `artifact.kind = \"plan\"` and infers a `key` starting with `.plans/`:\n\n```bash\nlamin save /path/to/.cursor/plans/my_task.plan.md\nlamin save /path/to/.claude/plans/my_task.md\n```\n\nLink an agent plan against a run:\n\n```python\nln.track(plan=\".plans/my-agent-plan.md\")\n```\n\nThis links the `plan` artifact to a run in the same way as `transform`, an initiating run (`initiated_by_run`), and `report` / `environment` artifacts are linked to the run.\n\nWhile `transform` acts as the deterministic source code for the run and `initiated_by_run` enables higher-level runs in workflow orchestration, the agent `plan` complements these by linking a plan that steers a non-deterministic agent.\n\n<!-- #endregion -->\n\n(manage-workflows)=\n\n## Manage workflows\n\nHere we'll manage workflows with `lamindb`'s {func}`~lamindb.flow` and {func}`~lamindb.step` decorators, which works out-of-the-box with the majority of Python workflow managers:\n\n| tool      | workflow decorator | step/task decorator | notes                                          |\n| --------- | ------------------ | ------------------- | ---------------------------------------------- |\n| `lamindb` | `@flow`            | `@step`             | inspired by `prefect`                          |\n| `prefect` | `@flow`            | `@task`             | two decorators                                 |\n| `redun`   | `@task` (on main)  | `@task`             | single decorator for everything    
            |\n| `dagster` | `@job` or `@asset` | `@op` or `@asset`   | asset-centric; `@asset` is primary             |\n| `flyte`   | `@workflow`        | `@task`             | also `@dynamic` for runtime DAGs               |\n| `airflow` | `@dag`             | `@task`             | TaskFlow API (modern); also supports operators |\n| `zenml`   | `@pipeline`        | `@step`             | inspired by `prefect`                          |\n\nIf you're looking for more in-depth examples or for integrating with non-decorator-based workflow managers such as Nextflow or Snakemake, see {doc}`docs:pipelines`.\n\n| tool        | workflow           | step/task         | notes            |\n| ----------- | ------------------ | ----------------- | ---------------- |\n| `nextflow`  | `workflow` keyword | `process` keyword | groovy-based DSL |\n| `snakemake` | `rule` keyword     | `rule` keyword    | file-based DSL   |\n| `metaflow`  | `FlowSpec`         | `@step`           | class-based      |\n| `kedro`     | `Pipeline()`       | `node()`          | function-based   |\n\n### A one-step workflow\n\nDecorate a function with {func}`~lamindb.flow` to track it as a workflow:\n\n```{eval-rst}\n.. 
literalinclude:: scripts/my_workflow.py\n   :language: python\n   :caption: my_workflow.py\n```\n\nLet's run the workflow:\n\n```python\n!python scripts/my_workflow.py\n```\n\nQuery the workflow via its filename:\n\n```python\ntransform = ln.Transform.get(key=\"my_workflow.py\")\ntransform.describe()\n```\n\nThe run stored the parameter value for `key`:\n\n```python\ntransform.latest_run.describe()\n```\n\nIt links output artifacts:\n\n```python\ntransform.latest_run.output_artifacts.to_dataframe()\n```\n\nYou can query for all runs that ran with that parameter:\n\n```python\nln.Run.filter(\n    params__key=\"my_analysis/dataset.parquet\",\n).to_dataframe()\n```\n\nYou can also pass complex parameters and features, see: {ref}`track-run-parameters`.\n\n### A multi-step workflow\n\nHere, the workflow calls an additional processing step:\n\n```{eval-rst}\n.. literalinclude:: scripts/my_workflow_with_step.py\n   :language: python\n   :caption: my_workflow_with_step.py\n```\n\nLet's run the workflow:\n\n```python\n!python scripts/my_workflow_with_step.py\n```\n\nThe lineage of the subsetted artifact resolves the subsetting step:\n\n```python\nsubsetted_artifact = ln.Artifact.get(key=\"my_analysis/dataset_subsetted.parquet\")\nsubsetted_artifact.view_lineage()\n```\n\nThis is the run that created the subsetted_artifact:\n\n```python\nsubsetted_artifact.run\n```\n\nThis is the initiating run that triggered the function call:\n\n```python\nsubsetted_artifact.run.initiated_by_run\n```\n\nThese are the parameters of the run:\n\n```python\nsubsetted_artifact.run.params\n```\n\nThese are the input artifacts:\n\n```python\nsubsetted_artifact.run.input_artifacts.to_dataframe()\n```\n\nThese are output artifacts:\n\n```python\nsubsetted_artifact.run.output_artifacts.to_dataframe()\n```\n\n### A workflow with CLI arguments\n\nLet's use `click` to parse CLI arguments:\n\n```{eval-rst}\n.. 
literalinclude:: scripts/my_workflow_with_click.py\n   :language: python\n   :caption: my_workflow_with_click.py\n```\n\nLet's run the workflow:\n\n```python\n!python scripts/my_workflow_with_click.py --key my_analysis/dataset2.parquet\n```\n\nCLI arguments are tracked and accessible via `run.cli_args`:\n\n```python\nrun = ln.Run.filter(transform__key=\"my_workflow_with_click.py\").first()\nrun.describe()\n```\n\nNote that it doesn't matter whether you use `click`, `argparse`, or any other CLI argument parser.\n\n(track-run-parameters)=\n\n## Track parameters & features\n\nWe just saw that the function decorators `@ln.flow()` and `@ln.step()` track parameter values automatically. Here is how to pass parameters to `ln.track()`:\n\n```{eval-rst}\n.. literalinclude:: scripts/run_track_with_params.py\n   :language: python\n   :caption: run_track_with_params.py\n```\n\nRun the script.\n\n```python\n!python scripts/run_track_with_params.py  --input-dir ./mydataset --learning-rate 0.01 --downsample\n```\n\nQuery for all runs that match certain parameters:\n\n```python\nln.Run.filter(\n    params__learning_rate=0.01,\n    params__preprocess_params__downsample=True,\n).to_dataframe()\n```\n\nDescribe & get parameters:\n\n```python\nrun = ln.Run.filter(params__learning_rate=0.01).order_by(\"-started_at\").first()\nrun.describe()\nrun.params\n```\n\nYou can also access the CLI arguments used to start the run directly:\n\n```python\nrun.cli_args\n```\n\nYou can also track run features in analogy to artifact features.\n\nIn contrast to params, features are validated against the `Feature` registry and allow to express relationships with entities in your registries.\n\nLet's first define labels & features.\n\n```python\nexperiment_type = ln.Record(name=\"Experiment\", is_type=True).save()\nexperiment_label = ln.Record(name=\"Experiment1\", type=experiment_type).save()\nln.Feature(name=\"s3_folder\", dtype=str).save()\nln.Feature(name=\"experiment\", 
dtype=experiment_type).save()\n```\n\n```python\n!python scripts/run_track_with_features_and_params.py  --s3-folder s3://my-bucket/my-folder --experiment Experiment1\n```\n\n```python\nln.Run.filter(s3_folder=\"s3://my-bucket/my-folder\").to_dataframe()\n```\n\nDescribe & get feature values.\n\n```python\nrun2 = ln.Run.filter(\n    s3_folder=\"s3://my-bucket/my-folder\", experiment=\"Experiment1\"\n).last()\nrun2.describe()\nrun2.features.get_values()\n```\n\n## Manage functions in scripts and notebooks\n\nIf you want more fine-grained data lineage tracking in a script or notebook where you called `ln.track()`, you can also use the `step()` decorator.\n\n### In a notebook\n\n```python\n@ln.step()\ndef subset_dataframe(\n    input_artifact_key: str,\n    output_artifact_key: str,\n    subset_rows: int = 2,\n    subset_cols: int = 2,\n) -> None:\n    artifact = ln.Artifact.get(key=input_artifact_key)\n    dataset = artifact.load()\n    new_data = dataset.iloc[:subset_rows, :subset_cols]\n    ln.Artifact.from_dataframe(new_data, key=output_artifact_key).save()\n```\n\nPrepare a test dataset:\n\n```python\ndf = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"DataFrame\")\ninput_artifact_key = \"my_analysis/dataset.parquet\"\nartifact = ln.Artifact.from_dataframe(df, key=input_artifact_key).save()\n```\n\nRun the function with default params:\n\n```python\nouput_artifact_key = input_artifact_key.replace(\".parquet\", \"_subsetted.parquet\")\nsubset_dataframe(input_artifact_key, ouput_artifact_key, subset_rows=1)\n```\n\nQuery for the output:\n\n```python\nsubsetted_artifact = ln.Artifact.get(key=ouput_artifact_key)\nsubsetted_artifact.view_lineage()\n```\n\nRe-run the function with a different parameter:\n\n```python\nsubsetted_artifact = subset_dataframe(\n    input_artifact_key, ouput_artifact_key, subset_cols=3\n)\nsubsetted_artifact = ln.Artifact.get(key=ouput_artifact_key)\nsubsetted_artifact.view_lineage()\n```\n\nWe created a new 
run:\n\n```python\nsubsetted_artifact.run\n```\n\nWith new parameters:\n\n```python\nsubsetted_artifact.run.params\n```\n\nAnd a new version of the output artifact:\n\n```python\nsubsetted_artifact.run.output_artifacts.to_dataframe()\n```\n\n### In a script\n\n```{eval-rst}\n.. literalinclude:: scripts/run_script_with_step.py\n   :language: python\n   :caption: run_script_with_step.py\n```\n\n```python\n!python scripts/run_script_with_step.py --subset\n```\n\n```python\nln.view()\n```\n\n## The database\n\nSee the state of the database after we ran these different examples:\n\n```python\nln.view()\n```\n\n## Using transform versions as templates\n\n<!-- #region -->\n\nA transform acts like a template upon using `lamin load` to load it. Consider you run:\n\n```bash\nlamin load https://lamin.ai/account/instance/transform/Akd7gx7Y9oVO0000\n```\n\nUpon running the returned notebook or script, you'll automatically create a new version and be able to browse it via the version dropdown on the UI.\n\nAdditionally, you can:\n\n- label using `ULabel` or `Record`, e.g., `transform.records.add(template_label)`\n- tag with an indicative `version` string, e.g., `transform.version = \"T1\"; transform.save()`\n<!-- #endregion -->\n\n<!-- #region -->\n\n:::{dropdown} Saving a notebook as an artifact\n\nSometimes you might want to save a notebook as an artifact. This is how you can do it:\n\n```bash\nlamin save template1.ipynb --key templates/template1.ipynb --description \"Template for analysis type 1\" --registry artifact\n```\n\n:::\n\n<!-- #endregion -->\n\nA few checks at the end of this notebook:\n\n```python\nassert run.params == {\n    \"input_dir\": \"./mydataset\",\n    \"learning_rate\": 0.01,\n    \"preprocess_params\": {\"downsample\": True, \"normalization\": \"the_good_one\"},\n}, run.params\nassert my_project.artifacts.exists()\nassert my_project.transforms.exists()\nassert my_project.runs.exists()\n```\n"
  },
  {
    "path": "lamindb/__init__.py",
    "content": "\"\"\"A data framework for biology.\n\nInstallation::\n\n   pip install lamindb\n\nIf you just want to *read* data from a LaminDB instance, use :class:`~lamindb.DB`::\n\n   import lamindb as ln\n\n   db = ln.DB(\"laminlabs/cellxgene\")\n\nTo *write* data, connect to a writable instance::\n\n   lamin login\n   lamin connect account/name\n\nYou can create an instance at `lamin.ai <https://lamin.ai>`__ and invite collaborators.\nIf you prefer to work with a local database (no login required), run::\n\n    lamin init --storage ./quickstart-data --modules bionty\n\nLaminDB will then auto-connect upon import and you can then create & save objects like this::\n\n   import lamindb as ln\n   # → connected lamindb: account/instance\n\n   ln.Artifact(\"./my_dataset.parquet\", key=\"datasets/my_dataset.parquet\").save()\n\nLineage\n=======\n\nTrack inputs, outputs, parameters, and environments of notebooks, scripts, and functions.\n\n.. autosummary::\n   :toctree: .\n\n   track\n   finish\n   flow\n   step\n\nArtifacts\n=========\n\nThe central `Artifact` registry holds files, folders & arrays across any number of storage locations.\n\n.. autosummary::\n   :toctree: .\n\n   Artifact\n\nAll other registries link to `Artifact` to provide context for finding, querying, validating, and managing artifacts.\nHere is an overview of the core data model:\n\n.. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/HMfWLa1rFkxcxQEN0000.svg\n    :width: 800px\n\n\nTransforms & runs\n=================\n\nData transformations and their executions.\n\n.. autosummary::\n   :toctree: .\n\n   Transform\n   Run\n\nRecords, labels, features & schemas\n===================================\n\nCreate labels and manage flexible records, e.g., for samples or donors.\n\n.. autosummary::\n   :toctree: .\n\n   Record\n   ULabel\n\nDefine features & schemas to validate artifacts & records.\n\n.. 
autosummary::\n   :toctree: .\n\n   Feature\n   Schema\n\nManaging operations\n===================\n\n.. autosummary::\n   :toctree: .\n\n   Project\n   Storage\n   User\n   Branch\n   Space\n   Collection\n   Reference\n\nBasic utilities\n===============\n\nConnecting, viewing database content, accessing settings & run context.\n\n.. autosummary::\n   :toctree: .\n\n   DB\n   connect\n   view\n   save\n   UPath\n   settings\n   context\n\nCurators and integrations\n=========================\n\n.. autosummary::\n   :toctree: .\n\n   curators\n   integrations\n\nExamples, errors & setup\n========================\n\n.. autosummary::\n   :toctree: .\n\n   examples\n   errors\n   setup\n\nDeveloper API\n=============\n\n.. autosummary::\n   :toctree: .\n\n   base\n   core\n   models\n\n\"\"\"\n\n# ruff: noqa: I001\n# denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.\n__version__ = \"2.4.2\"\n\nimport warnings as _warnings\n\n# through SpatialData\n_warnings.filterwarnings(\n    \"ignore\", message=\"The legacy Dask DataFrame implementation is deprecated\"\n)\n\nfrom lamindb_setup._check_setup import _check_instance_setup\nfrom lamindb_setup._connect_instance import connect\nfrom lamindb_setup.core.upath import UPath\n\nfrom . import base, errors, setup\n\n_check_instance_setup(from_module=\"lamindb\")\n\nfrom .core._functions import flow, step, tracked\nfrom ._view import view\nfrom .core._context import context\nfrom .core._settings import settings\nfrom .models import (\n    Artifact,\n    Collection,\n    Feature,\n    Project,\n    Reference,\n    Run,\n    Schema,\n    Storage,\n    Transform,\n    ULabel,\n    User,\n    Space,\n    Branch,\n    Record,\n    DB,\n)\nfrom .models.save import save\nfrom . import core\nfrom . import integrations\nfrom . import curators\nfrom . 
import examples\n\ntrack = context._track\nfinish = context._finish\nsettings.__doc__ = \"\"\"Global live settings (:class:`~lamindb.core.Settings`).\"\"\"\ncontext.__doc__ = \"\"\"Global run context (:class:`~lamindb.core.Context`).\"\"\"\n\nfrom django.db.models import Q\n\nParam = Feature  # backward compat\n\n__all__ = [\n    # data lineage\n    \"track\",\n    \"finish\",\n    \"step\",\n    \"flow\",\n    # registries\n    \"Artifact\",\n    \"Storage\",\n    \"Transform\",\n    \"Run\",\n    \"Feature\",\n    \"ULabel\",\n    \"Schema\",\n    \"Record\",\n    \"User\",\n    \"Collection\",\n    \"Project\",\n    \"Space\",\n    \"Branch\",\n    \"Reference\",\n    # other\n    \"connect\",\n    \"view\",\n    \"save\",\n    \"UPath\",\n    \"settings\",\n    \"context\",\n    \"DB\",\n    # curators and integrations\n    \"curators\",\n    \"integrations\",\n    # examples, errors, setup\n    \"examples\",\n    \"errors\",\n    \"setup\",\n    # low-level functionality\n    \"base\",\n    \"core\",\n    \"models\",\n]\n"
  },
  {
    "path": "lamindb/_finish.py",
    "content": "from __future__ import annotations\n\nimport builtins\nimport re\nfrom datetime import datetime, timezone\nfrom time import sleep\nfrom typing import TYPE_CHECKING\n\nimport lamindb_setup as ln_setup\nfrom lamin_utils import logger\nfrom lamin_utils._logger import LEVEL_TO_COLORS, LEVEL_TO_ICONS, RESET_COLOR\nfrom lamindb_setup.core.hashing import hash_dir, hash_file\n\nfrom lamindb.models import Artifact, Run, Transform\n\nis_run_from_ipython = getattr(builtins, \"__IPYTHON__\", False)\n\nif TYPE_CHECKING:\n    from pathlib import Path\n\n\ndef get_save_notebook_message() -> str:\n    # do not add bold() or any other complicated characters as then we can't match this\n    # easily anymore in an html to strip it out\n    return f\"please hit {get_shortcut()} to save the notebook in your editor\"\n\n\ndef get_save_notebook_message_retry() -> str:\n    return f\"{get_save_notebook_message()} and re-run finish()\"\n\n\n# this code was originally in nbproject by the same authors\ndef check_consecutiveness(\n    nb, calling_statement: str = None, silent_success: bool = True\n) -> bool:\n    \"\"\"Check whether code cells have been executed consecutively.\n\n    Needs to be called in the last code cell of a notebook.\n    Otherwise raises `RuntimeError`.\n\n    Returns cell transitions that violate execution at increments of 1 as a list\n    of tuples.\n\n    Args:\n        nb: Notebook content.\n        calling_statement: The statement that calls this function.\n    \"\"\"\n    cells = nb.cells\n\n    violations = []\n    prev = 0\n\n    ccount = 0  # need to initialize because notebook might note have code cells\n    # and below, we check if ccount is None\n    for cell in cells:\n        cell_source = \"\".join(cell[\"source\"])\n        if cell[\"cell_type\"] != \"code\" or cell_source == \"\":\n            continue\n\n        if calling_statement is not None and calling_statement in cell_source:\n            continue\n\n        ccount = 
cell[\"execution_count\"]\n        if ccount is None or prev is None or ccount - prev != 1:\n            violations.append((prev, ccount))\n\n        prev = ccount\n\n    # ignore the very last code cell of the notebook\n    # `check_consecutiveness` is being run during publish if `last_cell`` is True\n    # hence, that cell has ccount is None\n    if ccount is None:\n        violations.pop()\n\n    any_violations = len(violations) > 0\n    if any_violations:\n        logger.warning(f\"cells {violations} were not run consecutively\")\n    elif not silent_success:\n        logger.success(\"cell execution numbers increase consecutively\")\n\n    return not any_violations\n\n\ndef get_shortcut() -> str:\n    import platform\n\n    return \"CMD + s\" if platform.system() == \"Darwin\" else \"CTRL + s\"\n\n\ndef get_seconds_since_modified(filepath) -> float:\n    return datetime.now().timestamp() - filepath.stat().st_mtime\n\n\ndef save_run_logs(run: Run, save_run: bool = False) -> None:\n    logs_path = ln_setup.settings.cache_dir / f\"run_logs_{run.uid}.txt\"\n    if logs_path.exists():\n        if run.report is not None:\n            logger.important(\"overwriting run.report\")\n        artifact = Artifact(  # type: ignore\n            logs_path,\n            description=f\"log streams of run {run.uid}\",\n            kind=\"__lamindb_run__\",\n            run=False,\n        )\n        artifact.save(upload=True, print_progress=False)\n        run.report = artifact\n        if save_run:  # defaults to false because is slow\n            run.save()\n\n\n# this is from the get_title function in nbproject\n# should be moved into lamindb sooner or later\ndef prepare_notebook(\n    nb,\n    strip_title: bool = False,\n) -> str | None:\n    title_found = False\n    for cell in nb.cells:\n        cell.metadata.clear()  # strip cell metadata\n        if not title_found and cell[\"cell_type\"] == \"markdown\":\n            lines = cell[\"source\"].split(\"\\n\")\n            
for i, line in enumerate(lines):\n                if line.startswith(\"# \"):\n                    line.lstrip(\"#\").strip(\" .\").strip()\n                    title_found = True\n                    if strip_title:\n                        lines.pop(i)\n                        cell[\"source\"] = \"\\n\".join(lines)\n        # strip logging message about saving notebook in editor\n        # this is normally the last cell\n        if cell[\"cell_type\"] == \"code\" and \".finish(\" in cell[\"source\"]:\n            for output in cell[\"outputs\"]:\n                if \"to save the notebook in your editor\" in output.get(\"text\", \"\"):\n                    cell[\"outputs\"] = []\n                    break\n    return None\n\n\ndef notebook_to_report(notebook_path: Path, output_path: Path) -> None:\n    import nbformat\n    import traitlets.config as config\n    from nbconvert import HTMLExporter\n\n    with open(notebook_path, encoding=\"utf-8\") as f:\n        notebook = nbformat.read(f, as_version=4)\n    prepare_notebook(notebook, strip_title=True)\n    notebook.metadata.clear()  # strip notebook metadata\n    # if we were to export as ipynb, the following two lines would do it\n    # with open(output_path, \"w\", encoding=\"utf-8\") as f:\n    #     nbformat.write(notebook, f)\n    # instead we need all this code\n    c = config.Config()\n    c.HTMLExporter.preprocessors = []\n    c.HTMLExporter.exclude_input_prompt = True\n    c.HTMLExporter.exclude_output_prompt = True\n    c.HTMLExporter.anchor_link_text = \" \"\n    html_exporter = HTMLExporter(config=c)\n    html, _ = html_exporter.from_notebook_node(notebook)\n    output_path.write_text(html, encoding=\"utf-8\")\n\n\ndef notebook_to_script(  # type: ignore\n    title: str, notebook_path: Path, script_path: Path | None = None\n) -> None | str:\n    import jupytext\n\n    notebook = jupytext.read(notebook_path)\n    notebook.metadata.clear()\n    py_content = jupytext.writes(notebook, fmt=\"py:percent\")\n 
   # remove global metadata header\n    py_content = re.sub(r\"^# ---\\n.*?# ---\\n\\n\", \"\", py_content, flags=re.DOTALL)\n    # replace title\n    py_content = py_content.replace(f\"# # {title}\", \"#\")\n    if script_path is None:\n        return py_content\n    else:\n        script_path.write_text(py_content, encoding=\"utf-8\")\n\n\ndef clean_r_notebook_html(file_path: Path) -> tuple[str | None, Path]:\n    import re\n\n    cleaned_content = file_path.read_text()\n    # remove title from content\n    pattern_title = r\"<title>(.*?)</title>\"\n    title_match = re.search(pattern_title, cleaned_content)\n    title_text = None\n    if title_match:\n        title_text = title_match.group(1)\n        pattern_h1 = f\"<h1[^>]*>{re.escape(title_text)}</h1>\"\n        cleaned_content = re.sub(pattern_title, \"\", cleaned_content)\n        cleaned_content = re.sub(pattern_h1, \"\", cleaned_content)\n    # remove error message from content\n    if \"to save the notebook in your editor\" in cleaned_content:\n        orig_error_message = f\"! 
{get_save_notebook_message_retry()}\"\n        # coming up with the regex for this is a bit tricky due to all the\n        # escape characters we'd need to insert into the message; hence,\n        # we do this with a replace() instead\n        cleaned_content = cleaned_content.replace(orig_error_message, \"\")\n        if \"to save the notebook in your editor\" in cleaned_content:\n            orig_error_message = orig_error_message.replace(\n                \" finish()\", \"\\nfinish()\"\n            )  # RStudio might insert a newline\n            cleaned_content = cleaned_content.replace(orig_error_message, \"\")\n    cleaned_path = file_path.parent / (f\"{file_path.stem}.cleaned{file_path.suffix}\")\n    cleaned_path.write_text(cleaned_content, encoding=\"utf-8\")\n    return title_text, cleaned_path\n\n\ndef check_filepath_recently_saved(filepath: Path, is_finish_retry: bool) -> bool:\n    # the recently_saved_time needs to be very low for the first check\n    # because an accidental save (e.g. via auto-save) might otherwise lead\n    # to upload of an outdated notebook\n    # also see implementation for R notebooks below\n    offset_saved_time = 0.3 if not is_finish_retry else 20\n    for retry in range(30):\n        recently_saved_time = offset_saved_time + retry  # sleep time is 1 sec\n        if get_seconds_since_modified(filepath) > recently_saved_time:\n            if retry == 0:\n                prefix = f\"{LEVEL_TO_COLORS[20]}{LEVEL_TO_ICONS[20]}{RESET_COLOR}\"\n                print(f\"{prefix} {get_save_notebook_message()}\", end=\" \")\n            elif retry == 9:\n                print(\".\", end=\"\\n\")\n            elif retry == 4:\n                print(\". 
still waiting \", end=\"\")\n            else:\n                print(\".\", end=\"\")\n            sleep(1)\n        else:\n            if retry > 0:\n                prefix = f\"{LEVEL_TO_COLORS[25]}{LEVEL_TO_ICONS[25]}{RESET_COLOR}\"\n                print(f\" {prefix}\")\n            # filepath was recently saved, return True\n            return True\n    # if we arrive here, no save event occured, return False\n    return False\n\n\ndef save_context_core(\n    *,\n    run: Run | None,\n    transform: Transform,\n    filepath: Path,\n    finished_at: bool = False,\n    skip_save_report: bool = False,\n    ignore_non_consecutive: bool | None = None,\n    from_cli: bool = False,\n    is_retry: bool = False,\n    notebook_runner: str | None = None,\n    message_prefix: str = \"go to\",\n) -> str | None:\n    import lamindb as ln\n    from lamindb.models import (\n        format_field_value,  # needs to come after lamindb was imported because of CLI use\n    )\n\n    ln.settings.verbosity = \"success\"\n\n    # for scripts, things are easy\n    is_consecutive = True\n    is_ipynb = filepath.suffix == \".ipynb\"\n    is_r_notebook = filepath.suffix in {\".qmd\", \".Rmd\"}\n    source_code_path = filepath\n    report_path: Path | None = None\n    save_source_code_and_report = filepath.exists()\n    if (\n        is_run_from_ipython and notebook_runner != \"nbconvert\" and filepath.exists()\n    ):  # python notebooks in interactive session\n        if is_ipynb:\n            # ignore this for py:percent notebooks\n            import nbproject\n\n            # it might be that the user modifies the title just before ln.finish()\n            if (nbproject_title := nbproject.meta.live.title) != transform.description:\n                transform.description = nbproject_title\n                transform.save()\n        if not ln_setup._TESTING:\n            save_source_code_and_report = check_filepath_recently_saved(\n                filepath, is_retry\n            )\n       
     if not save_source_code_and_report and not is_retry:\n                logger.warning(get_save_notebook_message_retry())\n                return \"retry\"\n            elif not save_source_code_and_report:\n                logger.warning(\n                    \"the notebook on disk wasn't saved within the last 10 sec\"\n                )\n    if is_ipynb and filepath.exists():  # could be from CLI outside interactive session\n        try:\n            import jupytext  # noqa: F401\n            from nbproject.dev import (\n                read_notebook,\n            )\n        except ImportError:\n            logger.error(\"install nbproject & jupytext: pip install nbproject jupytext\")\n            return None\n        notebook_content = read_notebook(filepath)  # type: ignore\n        if not ignore_non_consecutive:  # ignore_non_consecutive is None or False\n            is_consecutive = check_consecutiveness(\n                notebook_content, calling_statement=\".finish(\"\n            )\n            if not is_consecutive:\n                response = \"n\"  # ignore_non_consecutive == False\n                if ignore_non_consecutive is None:  # only print warning\n                    response = \"y\"  # we already printed the warning\n                else:  # ask user to confirm\n                    response = input(\n                        \"   Do you still want to proceed with finishing? 
(y/n) \"\n                    )\n                if response != \"y\":\n                    return \"aborted-non-consecutive\"\n        # write the report\n        report_path = ln_setup.settings.cache_dir / filepath.name.replace(\n            \".ipynb\", \".html\"\n        )\n        notebook_to_report(filepath, report_path)\n        # write the source code\n        source_code_path = ln_setup.settings.cache_dir / filepath.name.replace(\n            \".ipynb\", \".py\"\n        )\n        notebook_to_script(transform.description, filepath, source_code_path)\n    elif is_ipynb and not filepath.exists():\n        logger.warning(\"notebook file does not exist in compute environment\")\n    elif is_r_notebook:\n        if filepath.with_suffix(\".nb.html\").exists():\n            report_path = filepath.with_suffix(\".nb.html\")\n        elif filepath.with_suffix(\".html\").exists():\n            report_path = filepath.with_suffix(\".html\")\n        else:\n            logger.warning(\n                f\"no html report found; to attach one, create an .html export for your {filepath.suffix} file and then run: lamin save {filepath}\"\n            )\n    if report_path is not None and is_r_notebook and not from_cli:  # R notebooks\n        # see comment above in check_filepath_recently_saved\n        recently_saved_time = 0.3 if not is_retry else 20\n        if get_seconds_since_modified(report_path) > recently_saved_time:\n            # the automated retry solution of Jupyter notebooks does not work in RStudio because the execution of the notebook cell\n            # seems to block the event loop of the frontend\n            if not is_retry:\n                logger.warning(get_save_notebook_message_retry())\n                return \"retry\"\n            else:\n                logger.warning(\n                    \"the notebook on disk hasn't been saved within the last 20 sec\"\n                )\n            save_source_code_and_report = False\n    
ln.settings.creation.artifact_silence_missing_run_warning = True\n    # save source code\n    if save_source_code_and_report:\n        return_code = transform._update_source_code_from_path(source_code_path)\n        if return_code == \"rerun-the-notebook\":\n            return \"rerun-the-notebook\"\n    if run is not None:\n        base_path = ln_setup.settings.cache_dir / \"environments\" / f\"run_{run.uid}\"\n        paths = [base_path / \"run_env_pip.txt\", base_path / \"r_environment.txt\"]\n        existing_paths = [path for path in paths if path.exists()]\n        if len(existing_paths) == 2:\n            # let's not store the python environment for an R session for now\n            existing_paths = [base_path / \"r_environment.txt\"]\n\n        if existing_paths:\n            overwrite_env = True\n            if run.environment_id is not None and from_cli:\n                logger.important(\"run.environment is already saved, ignoring\")\n                overwrite_env = False\n\n            if overwrite_env:\n                # Use directory if multiple files exist, otherwise use the single file\n                artifact_path: Path = (\n                    base_path if len(existing_paths) > 1 else existing_paths[0]\n                )\n\n                # Set description based on what we're saving\n                if len(existing_paths) == 1:\n                    if existing_paths[0].name == \"run_env_pip.txt\":\n                        description = \"requirements.txt\"\n                    elif existing_paths[0].name == \"r_environment.txt\":\n                        description = \"r_environment.txt\"\n                    size, env_hash, _ = hash_file(artifact_path)\n                else:\n                    description = \"environments\"\n                    size, env_hash, _, _ = hash_dir(artifact_path)\n\n                artifact = (\n                    ln.Artifact.objects.filter(hash=env_hash)\n                    .exclude(\n                        
size=0\n                    )  # exclude empty files, which may occur for one reason or another\n                    .one_or_none()\n                )\n                new_env_artifact = artifact is None\n\n                if new_env_artifact:\n                    if size > 0:\n                        artifact = ln.Artifact(\n                            artifact_path,\n                            description=description,\n                            kind=\"__lamindb_run__\",\n                            run=False,\n                        )\n                        artifact.save(upload=True, print_progress=False)\n                    else:\n                        logger.warning(\n                            \"environment file is empty, skipping linking an environment\"\n                        )\n\n                run.environment = artifact\n                if new_env_artifact:\n                    logger.debug(f\"saved run.environment: {run.environment}\")\n\n    # set finished_at\n    if finished_at and run is not None:\n        if not from_cli:\n            update_finished_at = True\n        else:\n            update_finished_at = run.finished_at is None\n        if update_finished_at:\n            run.finished_at = datetime.now(timezone.utc)\n\n    # track report and set is_consecutive\n    if save_source_code_and_report and not skip_save_report:\n        if run is not None:\n            # do not save a run report if executing through nbconvert\n            if report_path is not None and notebook_runner != \"nbconvert\":\n                if is_r_notebook:\n                    title_text, report_path = clean_r_notebook_html(report_path)\n                    if title_text is not None:\n                        transform.description = title_text\n                if run.report_id is not None:\n                    _, hash, _ = hash_file(report_path)  # ignore hash_type for now\n                    if hash != run.report.hash:\n                        response = 
input(\n                            f\"You are about to overwrite an existing report (hash '{run.report.hash}') for Run('{run.uid}'). Proceed? (y/n) \"\n                        )\n                        if response == \"y\":\n                            run.report.replace(report_path)\n                            run.report.save(upload=True, print_progress=False)\n                        else:\n                            logger.important(\"keeping old report\")\n                    else:\n                        logger.important(\"report is already saved\")\n                else:\n                    report_file = ln.Artifact(  # type: ignore\n                        report_path,\n                        description=f\"Report of run {run.uid}\",\n                        kind=\"__lamindb_run__\",  # hidden file\n                        run=False,\n                    )\n                    report_file.save(upload=True, print_progress=False)\n                    run.report = report_file\n                if is_r_notebook:\n                    # this is the \"cleaned\" report\n                    report_path.unlink()\n                logger.debug(\n                    f\"saved transform.latest_run.report: {transform.latest_run.report}\"\n                )\n            run._is_consecutive = is_consecutive\n        if report_path is not None and notebook_runner == \"nbconvert\":\n            logger.important(f\"to save the notebook html, run: lamin save {filepath}\")\n\n    # save both run & transform records if we arrive here\n    if run is not None:\n        run.save()\n    transform_id_prior_to_save = transform.id\n    transform.save()  # this in-place updates the state of transform upon hash collision\n    if transform.id != transform_id_prior_to_save:\n        # the hash existed and we're actually back to the previous version\n        # hence, this was in fact a run of the previous transform rather than of\n        # the new transform\n        # this can happen in 
interactively executed notebooks with a pro-active version bump in case it turns out that the user didn't make a change to the notebook\n        run.transform = transform\n        run.save()\n        ln.Transform.get(transform_id_prior_to_save).delete(permanent=True)\n\n    # finalize\n    if finished_at and not from_cli and run is not None:\n        run_time = run.finished_at - run.started_at\n        days = run_time.days\n        seconds = run_time.seconds\n        hours = seconds // 3600\n        minutes = (seconds % 3600) // 60\n        secs = seconds % 60\n        formatted_run_time = (\n            f\"{days}d\"\n            if days != 0\n            else \"\" + f\"{hours}h\"\n            if hours != 0\n            else \"\" + f\"{minutes}m\"\n            if minutes != 0\n            else \"\" + f\"{secs}s\"\n        )\n\n        logger.important(\n            f\"finished Run('{run.uid}') after {formatted_run_time} at {format_field_value(run.finished_at)}\"\n        )\n    if ln_setup.settings.instance.is_on_hub:\n        instance_slug = ln_setup.settings.instance.slug\n        if save_source_code_and_report:\n            ui_url = ln_setup.settings.instance.ui_url\n            logger.important(\n                f\"{message_prefix}: {ui_url}/{instance_slug}/transform/{transform.uid}\"\n            )\n        if finished_at and not from_cli and save_source_code_and_report:\n            thing = \"notebook\" if (is_ipynb or is_r_notebook) else \"script\"\n            logger.important(\n                f\"to update your {thing} from the CLI, run: lamin save {filepath}\"\n            )\n    if not save_source_code_and_report:\n        logger.warning(\n            f\"did *not* save source code and report -- to do so, run: lamin save {filepath}\"\n        )\n    return None\n"
  },
  {
    "path": "lamindb/_secret_redaction.py",
    "content": "from __future__ import annotations\n\nimport re\n\nREDACTED_SECRET_VALUE = \"***REDACTED***\"  # noqa: S105\nSENSITIVE_PARAM_KEY_PATTERN = re.compile(\n    r\"(^|[_\\-.])(api[_-]?key|access[_-]?key|secret|token|password|passwd|private[_-]?key|client[_-]?secret)($|[_\\-.])\"\n)\n\n# Match only quoted literals in assignments, e.g.:\n# - my_secret = \"value\"\n# - my.secret: \"value\"\n# - mySecret := \"value\"\n# We intentionally do not match unquoted RHS values to avoid false positives like\n# type annotations (`api_key: str`) or variable forwarding (`api_key=api_key`).\n_KEY_VALUE_ASSIGNMENT_PATTERN = re.compile(\n    r\"(?P<prefix>(?P<key>[A-Za-z_][A-Za-z0-9_.\\-]*)\\s*(?P<op>:=|=|:)\\s*)\"\n    r\"(?P<value>(?P<quote>['\\\"`])(?P<quoted>.*?)(?P=quote))\"\n)\n\n# Match: os.environ[\"API_KEY\"] = \"value\"\n_ENV_ASSIGNMENT_PATTERN = re.compile(\n    r\"(?P<prefix>os\\.environ\\[\\s*(?P<kquote>['\\\"])(?P<key>[^'\\\"]+)(?P=kquote)\\s*\\]\\s*=\\s*)\"\n    r\"(?P<value>(?P<quote>['\\\"`])(?P<quoted>.*?)(?P=quote))\"\n)\n\n# Match: {\"client_secret\": \"value\"}\n_QUOTED_KEY_ASSIGNMENT_PATTERN = re.compile(\n    r\"(?P<prefix>(?P<kquote>['\\\"])(?P<key>[^'\\\"]+)(?P=kquote)\\s*:\\s*)\"\n    r\"(?P<value>(?P<quote>['\\\"`])(?P<quoted>.*?)(?P=quote))\"\n)\n\n# We intentionally treat env lookups as safe/re-runnable references, not embedded secrets.\n# Examples that should remain unchanged:\n# - api_key = os.getenv(\"OPENAI_API_KEY\")\n# - api_key = getenv(\"OPENAI_API_KEY\")\n# - api_key = os.environ[\"OPENAI_API_KEY\"]\n# - api_key = os.environ.get(\"OPENAI_API_KEY\")\n_ENV_REFERENCE_VALUE_PATTERN = re.compile(\n    r\"^(os\\.getenv\\(.+\\)|getenv\\(.+\\)|os\\.environ\\[[^\\]]+\\]|os\\.environ\\.get\\(.+\\))$\"\n)\n\n# Match PostgreSQL URLs that include inline credentials:\n# - postgresql://user:password@host:5432/dbname\n# - postgres://user:password@host/dbname?sslmode=require\n_POSTGRES_CREDENTIALS_URL_PATTERN = re.compile(\n    
r\"^postgres(?:ql)?://[^:@/\\s]+:[^@/\\s]+@[^/\\s]+(?:/[^\\s]*)?$\",\n    re.IGNORECASE,\n)\n\n\ndef normalize_sensitive_key_name(key: str) -> str:\n    normalized_key = re.sub(r\"([A-Z]+)([A-Z][a-z])\", r\"\\1_\\2\", key)\n    normalized_key = re.sub(r\"([a-z0-9])([A-Z])\", r\"\\1_\\2\", normalized_key).lower()\n    return normalized_key\n\n\ndef is_sensitive_param_key(key: str) -> bool:\n    return bool(SENSITIVE_PARAM_KEY_PATTERN.search(normalize_sensitive_key_name(key)))\n\n\ndef is_sensitive_param_value(value: object) -> bool:\n    if not isinstance(value, str):\n        return False\n    return bool(_POSTGRES_CREDENTIALS_URL_PATTERN.match(value.strip()))\n\n\ndef _redact_assignment_match(match: re.Match[str]) -> str:\n    key = match.group(\"key\")\n    quoted_value = match.group(\"quoted\")\n    if not is_sensitive_param_key(key) and not is_sensitive_param_value(quoted_value):\n        return match.group(0)\n    # Redact only hardcoded values, not environment-based references.\n    # This preserves reproducibility for source code that reads secrets from env vars.\n    raw_value = match.group(\"value\")\n    if _ENV_REFERENCE_VALUE_PATTERN.match(raw_value):\n        return match.group(0)\n    quote = match.group(\"quote\")\n    redacted_value = (\n        f\"{quote}{REDACTED_SECRET_VALUE}{quote}\"\n        if quote is not None\n        else REDACTED_SECRET_VALUE\n    )\n    return f\"{match.group('prefix')}{redacted_value}\"\n\n\ndef redact_secrets_in_source_code(source_code: str) -> tuple[str, int]:\n    redaction_count = 0\n\n    def replace_with_count(match: re.Match[str]) -> str:\n        nonlocal redaction_count\n        replaced = _redact_assignment_match(match)\n        if replaced != match.group(0):\n            redaction_count += 1\n        return replaced\n\n    redacted = _ENV_ASSIGNMENT_PATTERN.sub(replace_with_count, source_code)\n    redacted = _KEY_VALUE_ASSIGNMENT_PATTERN.sub(replace_with_count, redacted)\n    redacted = 
_QUOTED_KEY_ASSIGNMENT_PATTERN.sub(replace_with_count, redacted)\n    return redacted, redaction_count\n"
  },
  {
    "path": "lamindb/_view.py",
    "content": "from __future__ import annotations\n\nimport builtins\nimport importlib\nimport inspect\nfrom typing import TYPE_CHECKING\n\nfrom lamin_utils import colors, logger\nfrom lamindb_setup import settings\nfrom lamindb_setup._init_instance import get_schema_module_name\n\nfrom lamindb.models import Feature, JsonValue, SQLRecord\n\nfrom .models.feature import serialize_pandas_dtype\n\nif TYPE_CHECKING:\n    import pandas as pd\n\nis_run_from_ipython = getattr(builtins, \"__IPYTHON__\", False)\n\n\ndef display_df_with_descriptions(\n    df: pd.DataFrame, descriptions: dict[str, str] | None = None\n):\n    from IPython.display import HTML, display\n\n    if descriptions is None:\n        display(df)\n        return None\n\n    # Start building HTML table\n    html = '<table class=\"dataframe\">'\n\n    # Create header with title and description rows\n    html += \"<thead>\"\n\n    # Column names row\n    html += \"<tr>\"\n    html += '<th class=\"header-title index-header\"></th>'  # Index header\n    for col in df.columns:\n        html += f'<th class=\"header-title\">{col}</th>'\n    html += \"</tr>\"\n\n    # Descriptions row\n    html += \"<tr>\"\n    html += f'<th class=\"header-desc index-header\">{df.index.name or \"\"}</th>'  # Index column\n    for col in df.columns:\n        desc = descriptions.get(col, \"\")\n        html += f'<th class=\"header-desc\">{desc}</th>'\n    html += \"</tr>\"\n\n    html += \"</thead>\"\n\n    # Add body rows\n    html += \"<tbody>\"\n    for idx, row in df.iterrows():\n        html += \"<tr>\"\n        html += f'<th class=\"row-index\">{idx}</th>'  # Index value\n        for col in df.columns:\n            html += f\"<td>{row[col]}</td>\"\n        html += \"</tr>\"\n    html += \"</tbody>\"\n    html += \"</table>\"\n\n    # Add CSS styles\n    styled_html = f\"\"\"\n    <style>\n        .dataframe {{\n            border-collapse: collapse;\n            margin: 10px 0;\n        }}\n        .dataframe th, .dataframe td {{\n            border: 1px solid #ddd;\n            padding: 8px;\n            text-align: left;\n        }}\n        .header-title {{\n            font-weight: bold;\n        }}\n        .header-desc {{\n            color: #666;\n            font-weight: normal;\n        }}\n        .row-index {{\n            font-weight: bold;\n        }}\n        .index-header {{\n            font-weight: bold;\n        }}\n    </style>\n    {html}\n    \"\"\"\n    return display(HTML(styled_html))\n\n\ndef view(\n    *,\n    limit: int = 7,\n    modules: str | None = None,\n    registries: list[str] | None = None,\n    df: pd.DataFrame | None = None,\n) -> None:\n    \"\"\"View metadata.\n\n    Args:\n        limit: Display the latest `n` records\n        modules: schema module to view. Defaults to\n            `None` and displays all registry modules.\n        registries: List of SQLRecord names. Defaults to\n            `None` and lists all registries.\n        df: A DataFrame to display.\n    \"\"\"\n    if df is not None:\n        descriptions = {\n            col_name: serialize_pandas_dtype(dtype)\n            for col_name, dtype in df.dtypes.to_dict().items()\n        }\n        feature_dtypes = dict(Feature.objects.values_list(\"name\", \"dtype\"))\n        descriptions.update(feature_dtypes)\n        display_df_with_descriptions(df, descriptions)\n        return None\n\n    if is_run_from_ipython:\n        from IPython.display import display as show\n    else:\n        show = logger.print\n\n    if modules is not None:\n        module_names = [modules]\n    else:\n        module_names = [\"core\"] + list(settings.instance.modules)\n\n    for module_name in module_names:\n        schema_module = importlib.import_module(get_schema_module_name(module_name))\n        # the below is necessary because a schema module might not have been\n        # explicitly accessed\n        importlib.reload(schema_module)\n\n        all_registries = {\n            registry\n            for registry in schema_module.__dict__.values()\n            if inspect.isclass(registry)\n            and issubclass(registry, SQLRecord)\n            and registry is not SQLRecord\n        }\n        if module_name == \"core\":\n            all_registries.update({JsonValue})\n        if registries is not None:\n            filtered_registries = {\n                registry\n                for registry in all_registries\n                if registry.__name__ in registries\n            }\n        else:\n            filtered_registries = all_registries\n        if len(module_names) > 1:\n            section = f\"* module: {colors.green(colors.bold(module_name))} *\"\n            section_no_color = f\"* module: {module_name} *\"\n            logger.print(\"*\" * len(section_no_color))\n            logger.print(section)\n            logger.print(\"*\" * len(section_no_color))\n        for registry in sorted(filtered_registries, key=lambda x: x.__name__):\n            df = registry.to_dataframe(limit=limit)\n            if df.shape[0] > 0:\n                logger.print(colors.blue(colors.bold(registry.__name__)))\n                show(df)\n"
  },
  {
    "path": "lamindb/base/__init__.py",
    "content": "\"\"\"Base library.\n\nIs also available when no instance is set up.\n\nModules\n-------\n\n.. autosummary::\n   :toctree: .\n\n   uids\n   types\n   fields\n   dtypes\n   utils\n\n\"\"\"\n\nfrom . import dtypes, fields, types, uids, utils\nfrom .utils import deprecated, doc_args\n\n__all__ = [\"dtypes\", \"fields\", \"types\", \"uids\", \"utils\"]\n"
  },
  {
    "path": "lamindb/base/dtypes.py",
    "content": "\"\"\"Dtype utils.\n\n.. autofunction:: check_dtype\n\n\"\"\"\n\nfrom datetime import datetime\nfrom typing import Any, Callable, Iterable\n\nimport numpy as np\n\n\ndef is_list_of_type(value: Any, expected_type: Any) -> bool:\n    \"\"\"Helper function to check if a value is a non-string iterable whose items are all instances of expected_type; scalars, str and bytes return False.\"\"\"\n    if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):\n        # only the direct items are checked, nested iterables are not traversed\n        return all(isinstance(item, expected_type) for item in value)\n    return False\n\n\ndef check_dtype(expected_type: Any, nullable: bool) -> Callable:\n    \"\"\"Creates a check function for Pandera that validates a column's dtype.\n\n    Supports standard dtype checking for scalar types and per-value checking of object-dtype columns for list types.\n    For example, a column with expected_type 'list[float]' accepts values that are lists of floats.\n\n    Args:\n        expected_type: String identifier for the expected type ('int', 'float', 'num', 'str', 'path', 'url', 'bool', or 'list[...]')\n        nullable: If True, a series in which all values are null (including an empty series) is considered valid\n\n    Returns:\n        A function that checks if a series matches the expected dtype\n    \"\"\"\n    import pandas as pd\n\n    from lamindb.models.query_set import SQLRecordList\n\n    def check_function(series):\n        # empty series are considered valid if feature is nullable\n        # the issue is that nullable in Pandera controls whether None/NaN values are allowed in the column, not whether the column can be empty (0 rows).\n        # so \"col\": [1, 2, None, 4] is correctly handled by pandera nullable=True, but an empty column \"col\": [] is not.\n        if nullable and series.isnull().all():\n            return True\n        # first check if the series is entirely of the expected dtype (fast path)\n        if expected_type == \"int\" and pd.api.types.is_integer_dtype(series.dtype):\n            return True\n        elif expected_type == \"float\" and pd.api.types.is_float_dtype(series.dtype):\n            return True\n        elif expected_type == \"num\" and pd.api.types.is_numeric_dtype(series.dtype):\n            return True\n        elif expected_type == \"str\" and pd.api.types.is_string_dtype(series.dtype):\n            return True\n        elif expected_type == \"path\" and pd.api.types.is_string_dtype(series.dtype):\n            return True\n        elif expected_type == \"url\" and pd.api.types.is_string_dtype(series.dtype):\n            return True\n        elif expected_type == \"bool\" and pd.api.types.is_bool_dtype(series.dtype):\n            return True\n\n        # if we're here, it might be a mixed column with object dtype\n        # need to check each value individually\n        if series.dtype == \"object\" and expected_type.startswith(\"list\"):\n            expected_type_member = expected_type.replace(\"list[\", \"\").removesuffix(\"]\")\n            if expected_type_member == \"int\":\n                return series.apply(lambda x: is_list_of_type(x, int)).all()\n            elif expected_type_member == \"float\":\n                return series.apply(lambda x: is_list_of_type(x, float)).all()\n            elif expected_type_member == \"bool\":\n                return series.apply(lambda x: is_list_of_type(x, bool)).all()\n            elif expected_type_member == \"num\":\n                # for numeric, accept either int or float\n                return series.apply(lambda x: is_list_of_type(x, (int, float))).all()\n            elif (\n                expected_type_member == \"str\"\n                or expected_type_member == \"path\"\n                or expected_type_member == \"url\"\n                or expected_type_member.startswith(\"cat[\")\n            ):\n                return series.apply(lambda x: is_list_of_type(x, str)).all()\n            elif expected_type_member == \"list\":\n                return series.apply(\n                    lambda x: isinstance(x, (list, np.ndarray, SQLRecordList))\n                ).all()\n\n        # if we get here, the validation failed\n        return False\n\n    return check_function\n\n\ndef is_valid_datetime_str(date_string: str) -> bool | str:\n    try:\n        dt = datetime.fromisoformat(date_string)\n        return dt.isoformat()\n    except ValueError:\n        return False\n\n\ndef is_iterable_of_sqlrecord(value: Any):\n    from lamindb.models import SQLRecord\n\n    return isinstance(value, Iterable) and isinstance(next(iter(value)), SQLRecord)\n"
  },
  {
    "path": "lamindb/base/fields.py",
    "content": "\"\"\"Fields.\n\nDjango fields with modified default arguments.\n\n.. autoclass:: CharField\n.. autoclass:: TextField\n.. autoclass:: ForeignKey\n.. autoclass:: BooleanField\n.. autoclass:: DateField\n.. autoclass:: DateTimeField\n.. autoclass:: BigIntegerField\n.. autoclass:: IntegerField\n.. autoclass:: OneToOneField\n.. autoclass:: FloatField\n.. autoclass:: DecimalField\n.. autoclass:: BinaryField\n.. autoclass:: JSONField\n.. autoclass:: EmailField\n.. autoclass:: TimeField\n.. autoclass:: SlugField\n.. autoclass:: URLField\n.. autoclass:: UUIDField\n.. autoclass:: PositiveIntegerField\n.. autoclass:: PositiveSmallIntegerField\n.. autoclass:: SmallIntegerField\n.. autoclass:: GenericIPAddressField\n.. autoclass:: DurationField\n\"\"\"\n\nfrom django.db import models\n\n\nclass CharField(models.CharField):\n    \"\"\"Custom `CharField` with default values for `blank`, `default`, and `max_length`.\n\n    Django default values for `CharField` are `blank=False`, `default=\"\"`, undefined `max_length`.\n    \"\"\"\n\n    def __init__(self, max_length: int = 255, **kwargs):\n        kwargs[\"max_length\"] = max_length  # Set max_length in kwargs\n        kwargs.setdefault(\"blank\", True)\n        kwargs.setdefault(\"default\", None)\n        super().__init__(**kwargs)  # Pass all arguments as kwargs\n\n\nclass TextField(models.TextField):\n    \"\"\"Custom `TextField` with default values for `blank` and `default`.\n\n    Django default values for `TextField` are `blank=False`, `default=''`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        kwargs.setdefault(\"default\", None)\n        super().__init__(*args, **kwargs)\n\n\nclass ForeignKey(models.ForeignKey):\n    \"\"\"Custom `ForeignKey` with default values for `blank`.\n\n    Django default value for `ForeignKey` `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        
super().__init__(*args, **kwargs)\n\n\n# fix doc string that otherwise errors\nForeignKey.get_extra_descriptor_filter.__doc__ = (\n    ForeignKey.get_extra_descriptor_filter.__doc__.replace(\n        \".filter(**kwargs)\", \"`.filter(**kwargs)`\"\n    )\n)\n\n\nclass BooleanField(models.BooleanField):\n    \"\"\"Custom `BooleanField` with default values for `blank` and `default`.\n\n    Django default values for `BooleanField` are `blank=False`, `default=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        kwargs.setdefault(\"default\", None)\n        super().__init__(*args, **kwargs)\n\n\nclass DateField(models.DateField):\n    \"\"\"Custom `DateField` with default values for `blank`.\n\n    Django default values for `DateField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass DateTimeField(models.DateTimeField):\n    \"\"\"Custom `DateTimeField` with default values for `blank`.\n\n    Django default values for `DateTimeField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass BigIntegerField(models.BigIntegerField):\n    \"\"\"Custom `BigIntegerField` with default values for `blank`.\n\n    Django default values for `BigIntegerField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        kwargs.setdefault(\"default\", None)\n        super().__init__(*args, **kwargs)\n\n\nclass IntegerField(models.IntegerField):\n    \"\"\"Custom `IntegerField` with default values for `blank`.\n\n    Django default values for `IntegerField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, 
**kwargs)\n\n\nclass OneToOneField(models.OneToOneField):\n    \"\"\"Custom `OneToOneField` with default values for `blank`.\n\n    Django default values for `OneToOneField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass FloatField(models.FloatField):\n    \"\"\"Custom `FloatField` with default values for `blank`.\n\n    Django default values for `FloatField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass DecimalField(models.DecimalField):\n    \"\"\"Custom `DecimalField` with default values for `blank`.\n\n    Django default values for `DecimalField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass JSONField(models.JSONField):\n    \"\"\"Custom `JSONField` with default values for `blank`.\n\n    Django default values for `JSONField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass DurationField(models.DurationField):\n    \"\"\"Custom `DurationField` with default values for `blank`.\n\n    Django default values for `DurationField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass URLField(models.URLField):\n    \"\"\"Custom `URLField` with default values for `blank`.\n\n    Django default values for `URLField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass EmailField(models.EmailField):\n    \"\"\"Custom `EmailField` with default 
values for `blank`.\n\n    Django default values for `EmailField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass TimeField(models.TimeField):\n    \"\"\"Custom `TimeField` with default values for `blank`.\n\n    Django default values for `TimeField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass SlugField(models.SlugField):\n    \"\"\"Custom `SlugField` with default values for `blank`.\n\n    Django default values for `SlugField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass UUIDField(models.UUIDField):\n    \"\"\"Custom `UUIDField` with default values for `blank`.\n\n    Django default values for `UUIDField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass PositiveIntegerField(models.PositiveIntegerField):\n    \"\"\"Custom `PositiveIntegerField` with default values for `blank`.\n\n    Django default values for `PositiveIntegerField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass PositiveSmallIntegerField(models.PositiveSmallIntegerField):\n    \"\"\"Custom `PositiveSmallIntegerField` with default values for `blank`.\n\n    Django default values for `PositiveSmallIntegerField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass SmallIntegerField(models.SmallIntegerField):\n    \"\"\"Custom `SmallIntegerField` with default values 
for `blank`.\n\n    Django default values for `SmallIntegerField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass BinaryField(models.BinaryField):\n    \"\"\"Custom `BinaryField` with default values for `blank`.\n\n    Django default values for `BinaryField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n\n\nclass GenericIPAddressField(models.GenericIPAddressField):\n    \"\"\"Custom `GenericIPAddressField` with default values for `blank`.\n\n    Django default values for `GenericIPAddressField` are `blank=False`.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        kwargs.setdefault(\"blank\", True)\n        super().__init__(*args, **kwargs)\n"
  },
  {
    "path": "lamindb/base/ids.py",
    "content": "from .uids import *  # noqa: F403\n"
  },
  {
    "path": "lamindb/base/types.py",
    "content": "\"\"\"Base types.\n\nCentral object types\n--------------------\n\n.. autoclass:: ArtifactKind\n.. autoclass:: TransformKind\n.. autoclass:: BlockKind\n.. autoclass:: BranchStatus\n.. autoclass:: RunStatus\n.. autoclass:: DtypeStr\n\nBasic types\n-----------\n\n.. autoclass:: AnyPathStr\n.. autoclass:: StrField\n.. autoclass:: ListLike\n.. autoclass:: FieldAttr\n\"\"\"\n\nfrom __future__ import annotations\n\nimport datetime\nfrom typing import TYPE_CHECKING, Literal, Union\n\nimport numpy as np\nfrom django.db.models.query_utils import DeferredAttribute as FieldAttr\nfrom lamindb_setup.types import AnyPathStr  # noqa: F401\n\nif TYPE_CHECKING:\n    import pandas as pd\n\n# need to use Union because __future__.annotations doesn't do the job here <3.10\n# typing.TypeAlias, >3.10 on but already deprecated\n# pd.Series as string to avoid importing pandas at runtime\nListLike = Union[list[str], \"pd.Series\", np.ndarray]\nStrField = Union[str, FieldAttr]  # typing.TypeAlias\n\nTransformKind = Literal[\"pipeline\", \"notebook\", \"script\", \"function\"]\nTransformType = TransformKind  # backward compat\nArtifactKind = Literal[\n    \"dataset\", \"model\", \"plan\", \"__lamindb_run__\", \"__lamindb_config__\"\n]\nBlockKind = Literal[\"readme\", \"comment\"]\n\"\"\"Block kind, a `README.md`-type page or comment.\n\nAny block expects Markdown as the formatting language.\n\"\"\"\n\nBranchStatus = Literal[\"standalone\", \"draft\", \"review\", \"merged\", \"closed\"]\n\"\"\"Branch status.\n\n=============  =====  ==================================================\nstatus         code   description\n=============  =====  ==================================================\n`closed`       -2     Change Request was closed without merging.\n`merged`       -1     The branch was merged into another branch.\n`standalone`   0      A standalone branch without Change Request.\n`draft`        1      Change Request exists but is not ready for review.\n`review`       2    
  Change Request is ready for review.\n=============  =====  ==================================================\n\nThe database stores the branch status as an integer code in field `_status_code`.\n\"\"\"\n\nRunStatus = Literal[\n    \"scheduled\", \"restarted\", \"started\", \"completed\", \"errored\", \"aborted\"\n]\n\"\"\"Run status.\n\n===========  =====  ===========================\nstatus       code   description\n===========  =====  ===========================\n`scheduled`  -3     The run is scheduled.\n`restarted`  -2     The run was restarted.\n`started`    -1     The run has started.\n`completed`  0      The run completed successfully.\n`errored`    1      The run ended with an error.\n`aborted`    2      The run was aborted.\n===========  =====  ===========================\n\nThe database stores the run status as an integer code in field `_status_code`.\n\"\"\"\n\nRUN_STATUS_TO_CODE: dict[RunStatus, int] = {\n    \"scheduled\": -3,\n    \"restarted\": -2,\n    \"started\": -1,\n    \"completed\": 0,\n    \"errored\": 1,\n    \"aborted\": 2,\n}\nRUN_CODE_TO_STATUS: dict[int, RunStatus] = {\n    code: status for status, code in RUN_STATUS_TO_CODE.items()\n}\n\nBRANCH_STATUS_TO_CODE: dict[BranchStatus, int] = {\n    \"closed\": -2,\n    \"merged\": -1,\n    \"standalone\": 0,\n    \"draft\": 1,\n    \"review\": 2,\n}\nBRANCH_CODE_TO_STATUS: dict[int, BranchStatus] = {\n    code: status for status, code in BRANCH_STATUS_TO_CODE.items()\n}\n\nDtypeObject = int | float | str | bool | datetime.date | datetime.datetime | dict\n\nDtypeStr = Literal[\n    \"num\",  # numericals\n    \"int\",  # integer / numpy.integer\n    \"float\",  # float\n    \"str\",  # string\n    \"bool\",  # boolean\n    \"datetime\",  # datetime\n    \"date\",  # date\n    \"dict\",  # dictionary\n    \"path\",  # path, validated as str, but specially treated in the UI\n    \"url\",  # URL, validated as str, but specially treated in the UI\n    \"object\",  # this is a pandas input 
dtype, we're only using it for complicated types, not for strings; consciously currently not documented\n]\n\"\"\"String-serialized representations of common data types.\n\n============  ============  =================================================\ndescription   lamindb       pandas\n============  ============  =================================================\nnumerical     `\"num\"`       `int | float`\ninteger       `\"int\"`       `int64 | int32 | int16 | int8 | uint | ...`\nfloat         `\"float\"`     `float64 | float32 | float16 | float8 | ...`\nstring        `\"str\"`       `object`\nboolean       `\"bool\"`      `boolean | bool`\ndatetime      `\"datetime\"`  `datetime`\ndate          `\"date\"`      `object` (pandera requires an ISO-format string, convert with `df[\"date\"] = df[\"date\"].dt.date`)\ndictionary    `\"dict\"`      `object`\npath          `\"path\"`      `str` (pandas does not have a dedicated path type, validated as `str`)\nurl           `\"url\"`       `str` (pandas does not have a dedicated url type, validated as `str`)\n============  ============  =================================================\n\n.. admonition:: Categorical and relational data types\n\n    These are **not** contained in the `DTypeStr` `Literal`.\n\n    For any categorical, you can restrict the permissible values to the values defined in a registry.\n    When serializing this to a string, then `'cat[ULabel]'` or `'cat[bionty.CellType]'` indicate that permissible values are stored in the `name` field of the `ULabel` or `CellType` registry, respectively.\n    You can also restrict to sub-types defined in registries via the `type` field, e.g., `'cat[ULabel[123456ABCDEFG]]'` indicates that values must be of the type with `uid=\"123456ABCDEFG\"` within the `ULabel` registry.\n\n    In LaminDB, categoricals define relationships with registries. 
See :class:`~lamindb.Feature` for more details.\n\n\"\"\"\nDtype = DtypeStr  # backward compat\n\nRegistryId = Literal[\n    \"__lamindb_artifact__\",\n    \"__lamindb_block__\",\n    \"__lamindb_collection__\",\n    \"__lamindb_feature__\",\n    \"__lamindb_jsonvalue__\",\n    \"__lamindb_project__\",\n    \"__lamindb_record__\",\n    \"__lamindb_run__\",\n    \"__lamindb_schema__\",\n    \"__lamindb_storage__\",\n    \"__lamindb_transform__\",\n    \"__lamindb_ulabel__\",\n]\n"
  },
  {
    "path": "lamindb/base/uids.py",
    "content": "\"\"\"Universal IDs.\n\nBase generators\n===============\n\n.. autofunction:: base26\n.. autofunction:: base62\n.. autofunction:: base64\n\nUID generators\n================\n\n.. autofunction:: base62_8\n.. autofunction:: base62_12\n.. autofunction:: base62_16\n.. autofunction:: base62_20\n\nCollision probabilities\n=======================\n\n8 base62 characters (`62**8=2e+14`):\n\n======= ===========\nn       p_collision\n======= ===========\n100k    2e-05\n1M      2e-03\n======= ===========\n\n12 base62 characters (`62**12=3e+21`):\n\n======= ===========\nn       p_collision\n======= ===========\n100M    2e-06\n1B      2e-04\n======= ===========\n\n16 base62 characters (`62**16=5e+28`):\n\n======= ===========\nn       p_collision\n======= ===========\n1e12    7e-05\n1e13    7e-03\n======= ===========\n\n20 base62 characters (`62**20=7e+35`) roughly matches UUID (`2**122=5e+36`):\n\n======= ===========\nn       p_collision\n======= ===========\n1e16    7e-05\n1e17    7e-03\n======= ===========\n\nSee `source <https://lamin.ai/laminlabs/lamindata/transform/t2xCdMB9v5wL>`__.\n\n\"\"\"\n\nimport secrets\nimport string\n\n\ndef base64(n_char: int) -> str:\n    \"\"\"Random Base64 string.\"\"\"\n    alphabet = string.digits + string.ascii_letters.swapcase() + \"_\" + \"-\"\n    uid = \"\".join(secrets.choice(alphabet) for i in range(n_char))\n    return uid\n\n\ndef base62(n_char: int) -> str:\n    \"\"\"Random Base62 string.\"\"\"\n    alphabet = string.digits + string.ascii_letters.swapcase()\n    uid = \"\".join(secrets.choice(alphabet) for i in range(n_char))\n    return uid\n\n\ndef base26(n_char: int):\n    \"\"\"ASCII lowercase.\"\"\"\n    alphabet = string.ascii_lowercase\n    uid = \"\".join(secrets.choice(alphabet) for i in range(n_char))\n    return uid\n\n\ndef base62_4() -> str:\n    return base62(4)\n\n\ndef base62_8() -> str:\n    \"\"\"Random Base62 string of length 8.\"\"\"\n    return base62(8)\n\n\ndef base62_12() -> str:\n    
\"\"\"Random Base62 string of length 12.\"\"\"\n    return base62(12)\n\n\ndef base62_16() -> str:\n    \"\"\"Random Base62 string of length 16.\"\"\"\n    return base62(16)\n\n\ndef base62_20() -> str:\n    \"\"\"Random Base62 string of length 20.\"\"\"\n    return base62(20)\n\n\ndef base62_24() -> str:\n    \"\"\"Random Base62 string of length 24.\"\"\"\n    return base62(24)\n"
  },
  {
    "path": "lamindb/base/users.py",
    "content": "user_id_cache = {}\n\n\ndef _user_has_write_access() -> bool:\n    from django.db import connection\n\n    with connection.cursor() as cursor:\n        cursor.execute(\"\"\"\n            SELECT EXISTS (\n                SELECT 1 FROM check_access() chk\n                WHERE chk.role in ('write', 'admin')\n            )\n        \"\"\")\n        return cursor.fetchone()[0]\n\n\ndef current_user_id() -> int:\n    import lamindb_setup as ln_setup\n    from lamindb_setup import settings\n    from lamindb_setup._init_instance import register_user\n\n    from lamindb.errors import NoWriteAccess\n    from lamindb.models import User\n\n    def query_user_id():\n        if ln_setup.core.django.IS_MIGRATING:\n            return 1\n        else:\n            user = settings.user\n            user_uid = user.uid\n            try:\n                user_id = User.objects.get(uid=user_uid).id\n            except User.DoesNotExist:\n                register_user(user)\n                try:\n                    user_id = User.objects.get(uid=user_uid).id\n                except User.DoesNotExist as e:\n                    isettings = settings.instance\n                    if isettings.is_read_only_connection:\n                        raise NoWriteAccess(\n                            \"Unable to register a new user in the instance database \"\n                            \"because you have a read-only connection.\"\n                        ) from e\n                    if (\n                        isettings._db_permissions == \"jwt\"\n                        and not _user_has_write_access()\n                    ):\n                        raise NoWriteAccess(\n                            \"Unable to register a new user in the instance database \"\n                            \"because you don't have write access to any space or registry.\"\n                        ) from e\n                    raise e\n            return user_id\n\n    if 
settings._instance_exists:\n        slug = settings.instance.slug\n        if slug not in user_id_cache:\n            user_id_cache[slug] = query_user_id()\n        return user_id_cache[slug]\n    else:\n        return query_user_id()\n"
  },
  {
    "path": "lamindb/base/utils.py",
    "content": "\"\"\"Utilities.\n\n.. autodecorator:: doc_args\n.. autodecorator:: deprecated\n.. autodecorator:: class_and_instance_method\n.. autodecorator:: strict_classmethod\n\n\"\"\"\n\nfrom functools import wraps\nfrom types import MethodType\n\nfrom lamindb_setup.core import deprecated, doc_args\n\n\nclass class_and_instance_method:\n    \"\"\"Decorator to define a method that works both as class and instance method.\"\"\"\n\n    def __init__(self, func):\n        self.func = func\n        wraps(func)(self)\n\n    def __get__(self, instance, owner):\n        if instance is None:\n            # Called on the class\n            return MethodType(self.func, owner)\n        else:\n            # Called on an instance\n            return MethodType(self.func, instance)\n\n\nclass strict_classmethod:\n    \"\"\"Decorator for a classmethod that raises an error when called on an instance.\"\"\"\n\n    def __init__(self, func):\n        self.func = func\n        wraps(func)(self)\n\n    def __get__(self, instance, owner):\n        if instance is not None:\n            # Called on an instance - raise immediately\n            raise TypeError(\n                f\"{owner.__name__}.{self.func.__name__}() is a class method and must be called on the {owner.__name__} class, not on a {owner.__name__} object\"\n            )\n\n        # Called on the class - return bound method using MethodType\n        return MethodType(self.func, owner)\n\n\n__all__ = [\n    \"doc_args\",\n    \"deprecated\",\n    \"class_and_instance_method\",\n    \"strict_classmethod\",\n]\n"
  },
  {
    "path": "lamindb/core/__init__.py",
    "content": "\"\"\"Core library.\n\nSettings & context:\n\n.. autosummary::\n   :toctree: .\n\n   Settings\n   subsettings\n   Context\n\nArtifact loaders:\n\n.. autosummary::\n   :toctree: .\n\n   loaders\n\nData loaders:\n\n.. autosummary::\n   :toctree: .\n\n   MappedCollection\n\nModules:\n\n.. autosummary::\n   :toctree: .\n\n   storage\n   logger\n\n\"\"\"\n\nfrom lamin_utils import logger\nfrom lamin_utils._inspect import InspectResult\n\nfrom .. import errors as exceptions  # backward compat\nfrom ..base import types  # backward compat\nfrom ..examples import datasets  # backward compat\nfrom . import subsettings\nfrom ._context import Context\nfrom ._settings import Settings\n\n\ndef __getattr__(name: str):\n    # need to lazy import a few auxiliary modules to maintain backward compatibility\n    # none of them should have been eagerly imported in the first place\n    import importlib\n\n    if name == \"loaders\":\n        loaders = importlib.import_module(\".loaders\", package=__name__)\n        globals()[name] = loaders\n        return loaders\n    if name == \"storage\":\n        storage = importlib.import_module(\".storage\", package=__name__)\n        globals()[name] = storage\n        return storage\n    if name == \"MappedCollection\":\n        from ._mapped_collection import MappedCollection\n\n        globals()[name] = MappedCollection\n        return MappedCollection\n    raise AttributeError(f\"module {__name__!r} has no attribute {name!r}\")\n"
  },
  {
    "path": "lamindb/core/_compat.py",
    "content": "import importlib.util\nfrom typing import Any, Callable, TypeVar\n\nT = TypeVar(\"T\")\n\n\ndef is_package_installed(package_name: str) -> bool:\n    spec = importlib.util.find_spec(package_name)\n    return spec is not None\n\n\ndef with_package(package_name: str, operation: Callable[[Any], T]) -> T:\n    \"\"\"Execute an operation that requires a specific package.\n\n    Args:\n        package_name: Package name (e.g., \"mudata\")\n        operation: Function that takes the imported module and returns a result\n\n    Examples:\n        # For direct package functions\n        result = with_package(\"mudata\", lambda mod: mod.read_zarr(path))\n    \"\"\"\n    try:\n        module = importlib.import_module(package_name)\n    except ImportError:\n        raise ImportError(\n            f\"Package '{package_name}' is required but not installed. \"\n            f\"Please install with: pip install {package_name}\"\n        ) from None\n    return operation(module)\n\n\ndef with_package_obj(\n    obj: Any, class_name: str, package_name: str, operation: Callable[[Any], T]\n) -> tuple[bool, T | None]:\n    \"\"\"Handle operations on objects that require specific packages.\n\n    Args:\n        obj: The object to operate on\n        class_name: Expected class name (e.g., \"MuData\")\n        package_name: Package that provides the class (e.g., \"mudata\")\n        operation: Function to call with the object if package is available.\n\n    Examples:\n        # For instance methods\n        handled, res = apply_class_func(dmem, \"MuData\", \"mudata\",\n                                      lambda obj: obj.write(filepath))\n    \"\"\"\n    if obj.__class__.__name__ == class_name:\n        try:\n            importlib.import_module(package_name)\n        except ImportError:\n            raise ImportError(\n                f\"Object appears to be {class_name} but '{package_name}' package is not installed. 
\"\n                f\"Please install with: pip install {package_name}\"\n            ) from None\n        result = operation(obj)\n        return True, result\n\n    return False, None\n"
  },
  {
    "path": "lamindb/core/_context.py",
    "content": "from __future__ import annotations\n\nimport builtins\nimport hashlib\nimport os\nimport signal\nimport sys\nimport threading\nimport traceback\nfrom datetime import datetime, timezone\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, Callable, TextIO\n\nimport lamindb_setup as ln_setup\nfrom django.db.models import Func, IntegerField, Q\nfrom lamin_utils._logger import logger\nfrom lamindb_setup.core.hashing import hash_file, hash_string\n\nfrom .._secret_redaction import (\n    REDACTED_SECRET_VALUE,\n    is_sensitive_param_key,\n    is_sensitive_param_value,\n    redact_secrets_in_source_code,\n)\nfrom ..base.uids import base62_12\nfrom ..errors import InvalidArgument, TrackNotCalled, UpdateContext\nfrom ..models import Run, SQLRecord, Transform, format_field_value\nfrom ..models._feature_manager import infer_convert_dtype_key_value\nfrom ..models._is_versioned import bump_version as bump_version_function\nfrom ..models._is_versioned import (\n    increment_base62,\n)\nfrom ._settings import settings\nfrom ._sync_git import get_transform_reference_from_git_repo\nfrom ._track_environment import track_python_environment\n\nif TYPE_CHECKING:\n    from types import FrameType, TracebackType\n\n    from lamindb.base.types import TransformKind\n    from lamindb.models import Artifact, Branch, Project, Space\n\n\nis_run_from_ipython = getattr(builtins, \"__IPYTHON__\", False)\n\nmsg_path_failed = \"failed to infer notebook path.\\nfix: pass `path` to `ln.track()`\"\n\n\ndef get_key_from_module(caller_module: str) -> str:\n    if \".\" in caller_module:\n        key_from_module = f\"pypackages/{caller_module.replace('.', '/')}.py\"\n    else:\n        key_from_module = None\n    return key_from_module\n\n\ndef detect_and_process_source_code_file(\n    *,\n    path: str | Path | None,\n    transform_kind: TransformKind | None = None,\n) -> tuple[Path, TransformKind, str, str, str | None]:\n    \"\"\"Track source code file and determine 
transform metadata.\n\n    For `.py` files, classified as \"script\".\n    For `.Rmd` and `.qmd` files, classified as \"notebook\" because they\n    typically come with an .html run report.\n\n    Package vs script criterion: source code is part of a **package** if the\n    caller's module name contains at least one `.` (module nesting goes beyond\n    the filename). Otherwise it is a **script** (module nesting stops at the\n    filename, e.g. `__main__`, `__mp_main__`, or a single top-level name).\n\n    Args:\n        path: Path to the source code file. If None, infers from call stack.\n\n    Returns:\n        Tuple of (path, transform_kind, reference, reference_type, key_from_module).\n        - path: Path object to the source file\n        - transform_kind: \"script\" or \"notebook\"\n        - reference: Git reference URL if sync_git_repo is set, else None\n        - reference_type: \"url\" if reference exists, else None\n        - key_from_module: If caller is part of a package (`.` in __name__),\n          `pypackages/module/path/to/file.py`; else None (key will be computed from dev_dir or path.name).\n\n    Raises:\n        NotImplementedError: If path cannot be determined from call stack.\n    \"\"\"\n    # for `.py` files, classified as \"script\"\n    # for `.Rmd` and `.qmd` files, which we classify\n    # as \"notebook\" because they typically come with an .html run report\n    key_from_module: str | None = None\n    if path is None:\n        import inspect\n\n        frame = inspect.stack()[2]\n        path_str = frame[1]\n        if not path_str or path_str.startswith(\"<\"):\n            raise NotImplementedError(\n                \"Cannot determine valid file path, pass manually via path (interactive sessions not yet supported)\"\n            )\n        path = Path(path_str)\n        # package vs script: nesting beyond filename makes the file part of a python package\n        caller_module = frame[0].f_globals.get(\"__name__\", \"__main__\")\n       
 key_from_module = get_key_from_module(caller_module)\n    else:\n        path = Path(path)\n    # for Rmd and qmd, we could also extract the title\n    # we don't do this for now as we're setting the title upon `ln.finish()` or `lamin save`\n    # by extracting it from the html while cleaning it: see clean_r_notebook_html()\n    # also see the script_to_notebook() in the CLI _load.py where the title is extracted\n    # from the source code YAML and updated with the transform description\n    # note that ipynb notebooks are handled in a separate function (_track_notebook())\n    if transform_kind is None:\n        transform_kind = \"notebook\" if path.suffix in {\".Rmd\", \".qmd\"} else \"script\"\n    reference = None\n    reference_type = None\n    if settings.sync_git_repo is not None and path.suffix != \".ipynb\":\n        reference = get_transform_reference_from_git_repo(path)\n        reference_type = \"url\"\n    return path, transform_kind, reference, reference_type, key_from_module\n\n\ndef get_uid_ext(version: str) -> str:\n    from lamin_utils._base62 import encodebytes\n\n    # merely zero-padding the nbproject version such that the base62 encoding is\n    # at least 4 characters long doesn't yield sufficiently diverse hashes and\n    # leads to collisions; it'd be nice because the uid_ext would be ordered\n    return encodebytes(hashlib.md5(version.encode()).digest())[:4]  # noqa: S324\n\n\ndef get_notebook_path() -> tuple[Path, str]:\n    from nbproject.dev._jupyter_communicate import (\n        notebook_path as get_notebook_path,\n    )\n\n    path = None\n    try:\n        path, env = get_notebook_path(return_env=True)\n    except ValueError as ve:\n        raise ve\n    except Exception as error:\n        raise RuntimeError(msg_path_failed) from error\n    if path is None:\n        raise RuntimeError(msg_path_failed) from None\n    return Path(path), env\n\n\n# from https://stackoverflow.com/questions/61901628\ndef get_notebook_key_colab() -> 
str:\n    from socket import gethostbyname, gethostname  # type: ignore\n\n    from requests import get  # type: ignore\n\n    ip = gethostbyname(gethostname())  # 172.28.0.12\n    try:\n        key = get(f\"http://{ip}:9000/api/sessions\").json()[0][\"name\"]  # noqa: S113\n        key = f\"colab/{key}\"\n    except Exception:\n        logger.warning(\n            \"could not get notebook key from Google Colab, using: colab/notebook.ipynb\"\n        )\n        key = \"colab/notebook.ipynb\"\n    return key\n\n\ndef get_cli_call() -> tuple[str, str] | None:\n    \"\"\"Returns (tool_name, args) when invoked as a script with CLI arguments.\n\n    Returns None if not run as a script (e.g., in Jupyter, interactive shell)\n    or when no arguments were passed.\n    \"\"\"\n    if len(sys.argv) > 1 and sys.argv[0] and not is_run_from_ipython:\n        return Path(sys.argv[0]).name, \" \".join(sys.argv[1:])\n    return None\n\n\ndef pretty_pypackages(dependencies: dict) -> str:\n    deps_list = []\n    for pkg, ver in dependencies.items():\n        if ver != \"\":\n            deps_list.append(pkg + f\"=={ver}\")\n        else:\n            deps_list.append(pkg)\n    deps_list.sort()\n    return \" \".join(deps_list)\n\n\ndef last_non_empty_r_block(line: str) -> str:\n    for block in reversed(line.split(\"\\r\")):\n        if block:\n            return block\n    return \"\"\n\n\nclass LogStreamHandler:\n    def __init__(self, log_stream: TextIO, file: TextIO, use_buffer: bool):\n        self.log_stream = log_stream\n        self.file = file\n\n        self._buffer = \"\"\n        self._use_buffer = use_buffer\n\n    def write(self, data: str) -> int:\n        data_length = len(data)\n\n        self.log_stream.write(data)\n        if self.file.closed:\n            return data_length\n\n        if not self._use_buffer:\n            self.file.write(data)\n            self.file.flush()\n            return data_length\n\n        self._buffer += data\n        # write only the 
last part of a line with carriage returns\n        while \"\\n\" in self._buffer:\n            if self.file.closed:\n                return data_length\n            line, self._buffer = self._buffer.split(\"\\n\", 1)\n            self.file.write(last_non_empty_r_block(line) + \"\\n\")\n            self.file.flush()\n\n        return data_length\n\n    def flush(self):\n        self.log_stream.flush()\n        if not self.file.closed:\n            self.file.flush()\n\n    # https://laminlabs.slack.com/archives/C07DB677JF6/p1759423901926139\n    # other tracking frameworks like W&B use our output stream and expect\n    # certain functions like isatty to be available\n    def isatty(self) -> bool:\n        return False\n\n    # .flush is sometimes (in jupyter etc.) called after every .write\n    # this needs to be called only at the end\n    def flush_buffer(self):\n        if not self.file.closed and self._buffer:\n            self.file.write(last_non_empty_r_block(self._buffer))\n            self._buffer = \"\"\n        self.flush()\n\n\nclass LogStreamTracker:\n    def __init__(self):\n        self.original_stdout = None\n        self.original_stderr = None\n        self.log_file = None\n        self.is_cleaning_up = False\n        self.original_excepthook: Callable[\n            [type[BaseException], BaseException, TracebackType | None], Any\n        ] = sys.excepthook\n\n        self.original_signal_handlers: dict[\n            signal.Signals, Callable[[int, FrameType | None], Any] | int\n        ] = {}\n        if threading.current_thread() == threading.main_thread():\n            self.original_signal_handlers[signal.SIGTERM] = signal.getsignal(\n                signal.SIGTERM\n            )\n            self.original_signal_handlers[signal.SIGINT] = signal.getsignal(\n                signal.SIGINT\n            )\n\n    def start(self, run: Run):\n        self.original_stdout = sys.stdout\n        self.original_stderr = sys.stderr\n        self.run = run\n       
 self.log_file_path = (\n            ln_setup.settings.cache_dir / f\"run_logs_{self.run.uid}.txt\"\n        )\n        self.log_file = open(self.log_file_path, \"w\", encoding=\"utf-8\")\n        # the instance that's connected is important information\n        self.log_file.write(\n            f\"\\x1b[92m→\\x1b[0m connected lamindb: {ln_setup.settings.instance.slug}\\n\"\n        )\n        # use buffering for correct handling of carriage returns\n        sys.stdout = LogStreamHandler(\n            self.original_stdout, self.log_file, use_buffer=True\n        )\n        # write evrything immediately in stderr\n        sys.stderr = LogStreamHandler(\n            self.original_stderr, self.log_file, use_buffer=False\n        )\n        # handle signals\n        # signal should be used only in the main thread, otherwise\n        # ValueError: signal only works in main thread of the main interpreter\n        if threading.current_thread() == threading.main_thread():\n            signal.signal(signal.SIGTERM, self.cleanup)\n            signal.signal(signal.SIGINT, self.cleanup)\n        # handle exceptions\n        sys.excepthook = self.handle_exception\n        # reset handler for lamin logger because sys.stdout has been replaced\n        logger.set_handler()\n\n    def finish(self):\n        if self.original_stdout:\n            getattr(sys.stdout, \"flush_buffer\", sys.stdout.flush)()\n            sys.stderr.flush()\n            sys.stdout = self.original_stdout\n            sys.stderr = self.original_stderr\n            if not self.log_file.closed:\n                self.log_file.close()\n            # reset handler for lamin logger because sys.stdout has been replaced\n            logger.set_handler()\n\n    def cleanup(self, signo=None, frame=None):\n        try:\n            from .._finish import save_run_logs\n\n            if self.original_stdout and not self.is_cleaning_up:\n                self.is_cleaning_up = True\n                if signo is not None:\n   
                 if self.log_file.closed:\n                        self.log_file = open(self.log_file_path, \"a\", encoding=\"utf-8\")\n                    getattr(sys.stdout, \"flush_buffer\", sys.stdout.flush)()\n                    sys.stderr.flush()\n                    signal_msg = f\"\\nProcess terminated by signal {signo} ({signal.Signals(signo).name})\\n\"\n                    if frame:\n                        signal_msg += (\n                            f\"Frame info:\\n{''.join(traceback.format_stack(frame))}\"\n                        )\n                    self.log_file.write(signal_msg)\n                    self.log_file.flush()\n                    self.run._status_code = 2  # aborted\n                else:\n                    self.run._status_code = 1  # errored\n                self.run.finished_at = datetime.now(timezone.utc)\n                sys.stdout = self.original_stdout\n                sys.stderr = self.original_stderr\n                if not self.log_file.closed:\n                    self.log_file.close()\n                save_run_logs(self.run, save_run=True)\n                # reset handler for lamin logger because sys.stdout has been replaced\n                logger.set_handler()\n        except:  # noqa: E722, S110\n            pass\n        finally:\n            if signo is not None and signo in self.original_signal_handlers:\n                original_handler = self.original_signal_handlers[signo]\n                if callable(original_handler):\n                    original_handler(signo, frame)\n\n    def handle_exception(self, exc_type, exc_value, exc_traceback):\n        try:\n            if self.original_stdout and not self.is_cleaning_up:\n                if self.log_file.closed:\n                    self.log_file = open(self.log_file_path, \"a\", encoding=\"utf-8\")\n                getattr(sys.stdout, \"flush_buffer\", sys.stdout.flush)()\n                sys.stderr.flush()\n                error_msg = 
f\"{''.join(traceback.format_exception(exc_type, exc_value, exc_traceback))}\"\n                self.log_file.write(error_msg)\n                self.log_file.flush()\n                self.cleanup()\n        except:  # noqa: E722, S110\n            pass\n        finally:\n            self.original_excepthook(exc_type, exc_value, exc_traceback)\n\n\ndef serialize_params_to_json(params: dict) -> dict:\n    serialized_params = {}\n    for key, value in params.items():\n        # None and empty list are missing/empty values, skip them consistent with elsewhere in the code\n        if value is None or (isinstance(value, list) and len(value) == 0):\n            continue\n        dtype, converted_value, _ = infer_convert_dtype_key_value(key, value, mute=True)\n        # converted_value is not JSON if dtype is a SQLRecord or a list of SQLRecords\n        # because we use the above function for features where we'd like to keep SQLRecords as they are\n        # so, need to handle this here\n        if (\n            dtype == \"?\" or dtype.startswith(\"cat\") or dtype.startswith(\"list[cat\")\n        ) and dtype not in {\"cat ? str\", \"list[cat ? 
str]\"}:\n            if isinstance(value, SQLRecord):\n                serialized_params[key] = (\n                    f\"{value.__class__.__get_name_with_module__()}[{value.uid}]\"\n                )\n            elif dtype.startswith(\"list[cat\"):\n                items = list(value)\n                if items and all(isinstance(item, SQLRecord) for item in items):\n                    serialized_params[key] = [  # type: ignore\n                        f\"{item.__class__.__get_name_with_module__()}[{item.uid}]\"\n                        for item in items\n                    ]\n        else:\n            serialized_params[key] = converted_value\n        if key not in serialized_params:\n            logger.warning(\n                f\"skipping param {key} with value {value} and dtype {dtype} not JSON serializable\"\n            )\n            continue\n        if is_sensitive_param_key(key) or is_sensitive_param_value(\n            serialized_params[key]\n        ):\n            serialized_params[key] = REDACTED_SECRET_VALUE\n    return serialized_params\n\n\nclass Context:\n    \"\"\"Run context.\n\n    Is the book keeper for :func:`~lamindb.track` and :func:`~lamindb.finish`.\n    \"\"\"\n\n    def __init__(self, uid: str | None = None, path: Path | None = None):\n        self._uid: str | None = uid\n        self._path: Path | None = path\n        self._description: str | None = None\n        self._version: str | None = None\n        self._transform: Transform | None = None\n        self._run: Run | None = None\n        self._project: Project | None = None\n        self._space: Space | None = None\n        self._branch: Branch | None = None\n        self._logging_message_track: str = \"\"\n        self._logging_message_imports: str = \"\"\n        self._stream_tracker: LogStreamTracker = LogStreamTracker()\n        self._is_finish_retry: bool = False\n        self._notebook_runner: str | None = None\n        self._is_step_decorator_run: bool = False\n\n    
@property\n    def transform(self) -> Transform | None:\n        \"\"\"Managed transform of context.\"\"\"\n        return self._transform\n\n    @property\n    def description(self) -> str | None:\n        \"\"\"`description` argument for `context.transform`.\"\"\"\n        return self._description\n\n    @description.setter\n    def description(self, value: str | None):\n        self._description = value\n\n    @property\n    def uid(self) -> str | None:\n        \"\"\"`uid` argument for `context.transform`.\"\"\"\n        return self._uid\n\n    @uid.setter\n    def uid(self, value: str | None):\n        self._uid = value\n\n    @property\n    def version(self) -> str | None:\n        \"\"\"`version` argument for `context.transform`.\"\"\"\n        return self._version\n\n    @version.setter\n    def version(self, value: str | None):\n        self._version = value\n\n    @property\n    def project(self) -> Project | None:\n        \"\"\"Project to label entities created during the run.\"\"\"\n        return self._project\n\n    @property\n    def space(self) -> Space | None:\n        \"\"\"The space in which artifacts, collections, transforms, and runs are saved during the run.\"\"\"\n        return self._space\n\n    @property\n    def branch(self) -> Branch | None:\n        \"\"\"The branch on which entities are created during the run.\"\"\"\n        return self._branch\n\n    @property\n    def run(self) -> Run | None:\n        \"\"\"Managed run of context.\"\"\"\n        return self._run\n\n    def _track(\n        self,\n        transform: str | Transform | None = None,\n        *,\n        project: str | Project | None = None,\n        space: str | Space | None = None,\n        branch: str | Branch | None = None,\n        plan: str | Artifact | None = None,\n        features: dict | None = None,\n        params: dict | None = None,\n        new_run: bool | None = None,\n        pypackages: bool | None = None,\n        key: str | None = None,\n        path: 
str | Path | None = None,\n        source_code: str | None = None,\n        kind: TransformKind | None = None,\n        entrypoint: str | None = None,\n        initiated_by_run: Run | str | None = None,\n        stream_tracking: bool | None = None,\n    ) -> None:\n        \"\"\"Track a run of a notebook or script.\n\n        Populates the global run :class:`~lamindb.context` with :class:`~lamindb.Transform` & :class:`~lamindb.Run` objects and tracks the compute environment.\n\n        Args:\n            transform: A transform (stem) `uid` or object. If `None`, auto-creates a `transform` with its `uid`.\n            project: A project or its `name` or `uid` for labeling entities created during the run.\n            space: A restricted space or its `name` or `uid` in which to store entities created during the run.\n                Default: the `\"all\"` space. Note that bionty entities ignore this setting and always get written to the `\"all\"` space.\n                If you want to manually move entities to a different space, set the `.space` field (:doc:`docs:permissions`).\n            branch: A branch (or its `name` or `uid`) on which to store records.\n            plan: A plan, typically an agent plan. Pass an artifact (or its `key` or `uid`).\n            features: A dictionary of features & values to track for the run.\n            params: A dictionary of params & values to track for the run.\n            new_run: If `False`, loads the latest run of transform\n                (default notebook), if `True`, creates new run (default non-notebook).\n            pypackages: If `True` or `None`, infers Python packages used in a notebook.\n            key: Transform key.\n            path: Filepath of a notebook or script.\n            source_code: Source code.\n            kind: Transform kind.\n            entrypoint: Optional entrypoint name (e.g. 
function qualname) for the run.\n            initiated_by_run: Optional parent run (or its `uid`) that triggered this run.\n                If `None`, falls back to the `LAMIN_INITIATED_BY_RUN_UID` environment variable when set.\n            stream_tracking: If set, override whether to capture stdout/stderr to run logs.\n                Used by the flow/step decorator: flows get logs (`True`), steps do not (`False`).\n\n        Examples:\n\n            To track the run of a notebook or script:\n\n            .. literalinclude:: scripts/run_track_and_finish.py\n               :language: python\n\n            To ensure one version history across file renames::\n\n                ln.track(\"Onv04I53OgtT\")\n\n            To track a project or an agent plan: pass a project/artifact to `ln.track()`, for example::\n\n                ln.track(project=\"My project\", plan=\"./plans/curate-dataset-x.md\")\n\n            Note that you have to create a project or save the agent plan in case they don't yet exist::\n\n                # create a project in Python\n                ln.Project(name=\"My project\").save()\n\n                # create a project with the CLI\n                lamin create project \"My project\"\n\n                # save an agent plan with the CLI\n                lamin save /path/to/.cursor/plans/curate-dataset-x.plan.md\n                lamin save /path/to/.claude/plans/curate-dataset-x.md\n\n            To sync code with a git repo, see: :ref:`sync-code-with-git`.\n\n            To track parameters and features, see: :ref:`track-run-parameters`.\n\n            To browse more examples, see: :doc:`/track`.\n        \"\"\"\n        from lamindb.models import Artifact, Branch, Project, Space\n\n        from .._finish import (\n            save_context_core,\n        )\n\n        # similar logic here: https://github.com/laminlabs/lamindb/pull/2527\n        if ln_setup.settings.instance.is_read_only_connection:\n            logger.warning(\"skipping 
track(), connected in read-only mode\")\n            return None\n        if project is None:\n            project = os.environ.get(\"LAMIN_CURRENT_PROJECT\")\n        if project is not None:\n            if isinstance(project, Project):\n                assert project._state.adding is False, (  # noqa: S101\n                    \"Project must be saved before passing it to track()\"\n                )\n                project_record = project\n            else:\n                project_record = Project.filter(\n                    Q(name=project) | Q(uid=project)\n                ).one_or_none()\n                if project_record is None:\n                    raise InvalidArgument(\n                        f\"Project '{project}' not found, either create it with `ln.Project(name='...').save()` or fix typos.\"\n                    )\n            self._project = project_record\n        if space is not None:\n            if isinstance(space, Space):\n                assert space._state.adding is False, (  # noqa: S101\n                    \"Space must be saved before passing it to track()\"\n                )\n                space_record = space\n            else:\n                space_record = Space.filter(Q(name=space) | Q(uid=space)).one_or_none()\n                if space_record is None:\n                    raise InvalidArgument(\n                        f\"Space '{space}', please check on the hub UI whether you have the correct `uid` or `name`.\"\n                    )\n            self._space = space_record\n        if branch is not None:\n            if isinstance(branch, Branch):\n                assert branch._state.adding is False, (  # noqa: S101\n                    \"Branch must be saved before passing it to track()\"\n                )\n                branch_record = branch\n            else:\n                branch_record = Branch.filter(\n                    Q(name=branch) | Q(uid=branch)\n                ).one_or_none()\n                if 
branch_record is None:\n                    raise InvalidArgument(\n                        f\"Branch '{branch}' not found, please check on the hub UI whether you have the correct `uid` or `name`.\"\n                    )\n            self._branch = branch_record\n        plan_record: Artifact | None = None\n        if plan is not None:\n            if isinstance(plan, Artifact):\n                assert plan._state.adding is False, (  # noqa: S101\n                    \"Plan artifact must be saved before passing it to track()\"\n                )\n                plan_record = plan\n            else:\n                plan_record = Artifact.filter(Q(key=plan) | Q(uid=plan)).one_or_none()\n                if plan_record is None:\n                    raise InvalidArgument(\n                        f\"Plan artifact '{plan}' not found, either create it or use a valid key/uid.\"\n                    )\n        if initiated_by_run is None:\n            initiated_by_run = os.environ.get(\"LAMIN_INITIATED_BY_RUN_UID\")\n        initiated_by_run_record: Run | None = None\n        if initiated_by_run is not None:\n            if isinstance(initiated_by_run, Run):\n                assert initiated_by_run._state.adding is False, (  # noqa: S101\n                    \"initiated_by_run must be saved before passing it to track()\"\n                )\n                initiated_by_run_record = initiated_by_run\n            else:\n                initiated_by_run_record = Run.filter(uid=initiated_by_run).one_or_none()\n                if initiated_by_run_record is None:\n                    raise InvalidArgument(\n                        f\"Run '{initiated_by_run}' not found, please pass a valid run uid.\"\n                    )\n        self._logging_message_track = \"\"\n        self._logging_message_imports = \"\"\n        self._is_step_decorator_run = (\n            entrypoint is not None and stream_tracking is False\n        )\n        if transform is not None and isinstance(transform, 
str):\n            self.uid = transform\n            transform = None\n            uid_was_none = False\n        else:\n            uid_was_none = True\n        self._path = None\n        cli_call = get_cli_call()\n        if transform is None:\n            description = None\n            transform_ref = None\n            transform_ref_type = None\n            if source_code is not None:\n                transform_kind = kind if kind is not None else \"function\"\n                assert key is not None, (\n                    \"`key` cannot be `None` when `source_code` is passed to `track()`.\"\n                )\n                assert path is None, (\n                    \"`path` cannot be passed when `source_code` is passed to `track()`.\"\n                )\n            else:\n                if is_run_from_ipython:\n                    self._path, description = self._track_notebook(\n                        path_str=path, pypackages=pypackages\n                    )\n                    transform_kind = \"notebook\"\n                else:\n                    (\n                        self._path,\n                        transform_kind,\n                        transform_ref,\n                        transform_ref_type,\n                        key_from_module,\n                    ) = detect_and_process_source_code_file(path=path)\n                    if key is None and key_from_module is not None:\n                        key = key_from_module\n            if description is None:\n                description = self._description\n            if description is None and cli_call is not None:\n                description = f\"CLI: {cli_call[0]}\"\n            self._create_or_load_transform(\n                description=description,\n                transform_ref=transform_ref,\n                transform_ref_type=transform_ref_type,\n                transform_kind=transform_kind,\n                key=key,\n                source_code=source_code,\n            
)\n        else:\n            if transform.kind in {\"notebook\", \"script\"}:\n                raise ValueError(\n                    \"Use `ln.track()` without passing transform in a notebook or script\"\n                    \" - metadata is automatically parsed\"\n                )\n            transform_exists = None\n            if transform.id is not None:\n                # transform has an id but unclear whether already saved\n                transform_exists = Transform.filter(id=transform.id).first()\n            if transform_exists is None:\n                transform.save()\n                self._logging_message_track += (\n                    f\"created Transform('{transform.uid}', key='{transform.key}')\"\n                )\n                transform_exists = transform\n            else:\n                self._logging_message_track += (\n                    f\"loaded Transform('{transform.uid}', key='{transform.key}')\"\n                )\n            self._transform = transform_exists\n\n        if new_run is None:  # for notebooks, default to loading latest runs\n            new_run = (\n                False\n                if (\n                    self._transform.kind == \"notebook\"\n                    and self._notebook_runner != \"nbconvert\"\n                )\n                else True\n            )  # type: ignore\n\n        run = None\n        if not new_run:  # try loading latest run by same user\n            run = (\n                Run.filter(\n                    transform=self._transform, created_by_id=ln_setup.settings.user.id\n                )\n                .order_by(\"-created_at\")\n                .first()\n            )\n            if run is not None:  # loaded latest run\n                run.started_at = datetime.now(timezone.utc)  # update run time\n                run._status_code = -2  # re-started\n                if plan_record is not None:\n                    run.plan = plan_record\n                    
run.save()\n                entrypoint_str = (\n                    f\", entrypoint='{entrypoint}'\" if entrypoint is not None else \"\"\n                )\n                self._logging_message_track += f\", re-started Run('{run.uid}'{entrypoint_str}) at {format_field_value(run.started_at)}\"\n\n        if run is None:  # create new run\n            run = Run(transform=self._transform, plan=plan_record)\n            if entrypoint is not None:\n                run.entrypoint = entrypoint\n            if initiated_by_run_record is not None:\n                run.initiated_by_run = initiated_by_run_record\n            run.started_at = datetime.now(timezone.utc)\n            run._status_code = -1  # started\n            entrypoint_str = (\n                f\", entrypoint='{entrypoint}'\" if entrypoint is not None else \"\"\n            )\n            self._logging_message_track += f\", started new Run('{run.uid}'{entrypoint_str}) at {format_field_value(run.started_at)}\"\n        # can only determine at ln.finish() if run was consecutive in\n        # interactive session, otherwise, is consecutive\n        run.is_consecutive = True if is_run_from_ipython else None\n        if params is not None:\n            run.params = serialize_params_to_json(params)\n            self._logging_message_track += \"\\n→ params: \" + \", \".join(\n                f\"{key}={value!r}\" for key, value in run.params.items()\n            )\n        if cli_call is not None:\n            _, cli_args = cli_call\n            logger.important(f\"script invoked with: {cli_args}\")\n            run.cli_args = cli_args\n        run.save()  # need to save now\n        if features is not None:\n            run.features.add_values(features)\n            self._logging_message_track += \"\\n→ features: \" + \", \".join(\n                f\"{key}={value!r}\" for key, value in features.items()\n            )\n        self._run = run\n        track_python_environment(run)\n        if self.project is not 
None:\n            # to update a potential project link\n            # is only necessary if transform is loaded rather than newly created\n            # can be optimized by checking whether the transform is loaded, but it typically is\n            self.transform.save()\n        log_to_file = None\n        if log_to_file is None:\n            if stream_tracking is not None:\n                log_to_file = stream_tracking\n            else:\n                # Script runs get stream tracking; decorator-based runs only when\n                # stream_tracking is passed (flow=True from decorator).\n                log_to_file = self.transform.kind == \"script\"\n        if log_to_file:\n            self._stream_tracker.start(run)\n        logger.important(self._logging_message_track)\n        if self._logging_message_imports:\n            logger.important(self._logging_message_imports)\n        if uid_was_none and self._path is not None:\n            # Flow/step decorators set run.entrypoint. Show this recommendation only\n            # for flows (`stream_tracking=True`) and suppress it for steps.\n            if entrypoint is not None:\n                if stream_tracking:\n                    logger.important_hint(\n                        f'recommendation: to identify the script across renames, pass the uid: @ln.flow(uid=\"{self.transform.uid[:-4]}\")'\n                    )\n            else:\n                notebook_or_script = (\n                    \"notebook\" if self._transform.kind == \"notebook\" else \"script\"\n                )\n                r_or_python = \".\" if self._path.suffix in {\".py\", \".ipynb\"} else \"$\"\n                project_str = (\n                    f', project=\"{project if isinstance(project, str) else project.name}\"'\n                    if project is not None\n                    else \"\"\n                )\n                space_str = (\n                    f', space=\"{space if isinstance(space, str) else space.name}\"'\n      
              if space is not None\n                    else \"\"\n                )\n                plan_str = (\n                    f', plan=\"{plan if isinstance(plan, str) else plan.key}\"'\n                    if plan is not None\n                    else \"\"\n                )\n                params_str = (\n                    \", params={...}\" if params is not None else \"\"\n                )  # do not put the values because typically parameterized by user\n                kwargs_str = f\"{project_str}{space_str}{plan_str}{params_str}\"\n                logger.important_hint(\n                    f'recommendation: to identify the {notebook_or_script} across renames, pass the uid: ln{r_or_python}track(\"{self.transform.uid[:-4]}\"{kwargs_str})'\n                )\n        if (\n            self.transform.kind == \"script\"\n            and self._path is not None\n            and not self._is_step_decorator_run\n        ):\n            save_context_core(\n                run=run,\n                transform=self.transform,\n                filepath=self._path,\n                message_prefix=\"monitor at\",\n            )\n\n    def _track_notebook(\n        self,\n        *,\n        path_str: str | Path | None,\n        pypackages: bool | None = None,\n    ) -> tuple[Path, str | None]:\n        if path_str is None:\n            path, self._notebook_runner = get_notebook_path()\n        else:\n            path = Path(path_str)\n        if pypackages is None:\n            pypackages = True\n        description = None\n        if path.suffix == \".ipynb\" and path.stem.startswith(\"Untitled\"):\n            raise RuntimeError(\n                \"Your notebook file name is 'Untitled.ipynb', please rename it before tracking. 
You might have to re-start your notebook kernel.\"\n            )\n        path_str = path.as_posix()\n        if path_str.startswith(\"/fileId=\"):\n            logger.warning(\"tracking on Google Colab is experimental\")\n            path_str = get_notebook_key_colab()\n            path = Path(path_str)\n        else:\n            from nbproject.dev import read_notebook\n            from nbproject.dev._meta_live import get_title\n            from nbproject.dev._pypackage import infer_pypackages\n\n            try:\n                nb = read_notebook(path_str)\n\n                nbproject_title = get_title(nb)\n                if nbproject_title is not None:\n                    description = nbproject_title\n\n                if pypackages:\n                    self._logging_message_imports += (\n                        \"notebook imports:\"\n                        f\" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}\"\n                    )\n            except Exception:\n                logger.debug(\"reading the notebook file failed\")\n                pass\n        return path, description\n\n    def _process_aux_transform(\n        self,\n        aux_transform: Transform,\n        transform_hash: str,\n    ) -> tuple[str, Transform | None, str]:\n        # first part of the if condition: no version bump, second part: version bump\n        message = \"\"\n        if (\n            # if a user hasn't yet saved the transform source code AND is the same user\n            (\n                aux_transform.source_code is None\n                and aux_transform.created_by_id == ln_setup.settings.user.id\n            )\n            # if the transform source code is unchanged\n            # if aux_transform.kind == \"notebook\", we anticipate the user makes changes to the notebook source code\n            # in an interactive session, hence we *pro-actively bump* the version number by setting `revises` / 'nbconvert' execution is NOT interactive\n           
 # in the second part of the if condition even though the source code is unchanged at point of running track()\n            or (\n                aux_transform.hash == transform_hash\n                and (\n                    aux_transform.kind != \"notebook\"\n                    or self._notebook_runner == \"nbconvert\"\n                )\n            )\n        ):\n            uid = aux_transform.uid\n            return uid, aux_transform, message\n        else:\n            uid = f\"{aux_transform.uid[:-4]}{increment_base62(aux_transform.uid[-4:])}\"\n            message = (\n                f\"found {aux_transform.kind} {aux_transform.key}, making new version\"\n            )\n            if (\n                aux_transform.hash == transform_hash\n                and aux_transform.kind == \"notebook\"\n            ):\n                message += \" -- anticipating changes\"\n            elif aux_transform.hash != transform_hash:\n                message += (\n                    \"\"  # could log \"source code changed\", but this seems too much\n                )\n            elif aux_transform.created_by_id != ln_setup.settings.user.id:\n                message += (\n                    f\" -- {aux_transform.created_by.handle} already works on this draft\"\n                )\n            return uid, None, message\n\n    def _create_or_load_transform(\n        self,\n        *,\n        description: str | None = None,\n        transform_ref: str | None = None,\n        transform_ref_type: str | None = None,\n        transform_kind: TransformKind = None,\n        key: str | None = None,\n        source_code: str | None = None,\n    ):\n        source_code_to_store = source_code\n        if source_code is not None:\n            source_code_to_store, redaction_count = redact_secrets_in_source_code(\n                source_code\n            )\n            if redaction_count > 0:\n                logger.warning(\n                    f\"redacted {redaction_count} 
secret-looking assignment(s) before persisting transform source code\"\n                )\n            transform_hash = hash_string(source_code)\n        else:\n            from .._finish import notebook_to_script\n\n            if not self._path.suffix == \".ipynb\":\n                _, transform_hash, _ = hash_file(self._path)\n            else:\n                # need to convert to stripped py:percent format for hashing\n                source_code_path = (\n                    ln_setup.settings.cache_dir\n                    / self._path.name.replace(\".ipynb\", \".py\")\n                )\n                if (\n                    self._path.exists()\n                ):  # notebook kernel might be running on a different machine\n                    notebook_to_script(description, self._path, source_code_path)\n                    _, transform_hash, _ = hash_file(source_code_path)\n                else:\n                    logger.debug(\n                        \"skipping notebook hash comparison, notebook kernel running on a different machine\"\n                    )\n                    transform_hash = None\n\n        # see whether we find a transform with the exact same hash\n        if transform_hash is not None:\n            aux_transform = Transform.filter(hash=transform_hash).first()\n        else:\n            aux_transform = None\n\n        # determine the transform key (only when path-based; key is required when source_code)\n        if key is None:\n            if ln_setup.settings.dev_dir is not None:\n                try:\n                    key = self._path.relative_to(ln_setup.settings.dev_dir).as_posix()\n                except ValueError as e:\n                    if \"subpath\" in str(e):\n                        logger.warning(\n                            f\"Path {self._path} is not within the configured dev directory \"\n                            f\"({ln_setup.settings.dev_dir}), falling back to using filename as transform key \"\n     
                       f\"('{self._path.name}').\"\n                        )\n                        key = self._path.name\n                    else:\n                        raise\n            else:\n                key = self._path.name\n        # if the user did not pass a uid and there is no matching aux_transform\n        # need to search for the transform based on the key\n        if self.uid is None and aux_transform is None:\n\n            class SlashCount(Func):\n                template = \"LENGTH(%(expressions)s) - LENGTH(REPLACE(%(expressions)s, '/', ''))\"\n                output_field = IntegerField()\n\n            # we need to traverse from greater depth to shorter depth so that we match better matches first\n            transforms = (\n                Transform.filter(key__endswith=key, is_latest=True)\n                .annotate(slash_count=SlashCount(\"key\"))\n                .order_by(\"-slash_count\")\n            )\n            uid = f\"{base62_12()}0000\"\n            target_transform = None\n            if len(transforms) != 0:\n                message = \"\"\n                found_key = False\n                if self._path is not None:\n                    for aux_transform in transforms:\n                        # check whether the transform key is in the path\n                        # that's not going to be the case for keys that have \"/\" in them and don't match the folder\n                        if aux_transform.key in self._path.as_posix():\n                            key = aux_transform.key\n                            uid, target_transform, message = (\n                                self._process_aux_transform(\n                                    aux_transform, transform_hash\n                                )\n                            )\n                            found_key = True\n                            break\n                if not found_key:\n                    plural_s = \"s\" if len(transforms) > 1 else 
\"\"\n                    transforms_str = \"\\n\".join(\n                        [\n                            f\"    {transform.uid} → {transform.key}\"\n                            for transform in transforms\n                        ]\n                    )\n                    message = f\"ignoring transform{plural_s} with same filename in different folder:\\n{transforms_str}\"\n                if message != \"\":\n                    logger.important(message)\n            self.uid, transform = uid, target_transform\n        # the user did pass the uid\n        elif self.uid is not None and len(self.uid) == 16:\n            transform = Transform.filter(uid=self.uid).one_or_none()\n        else:\n            if self.uid is not None:\n                # the case with length 16 is covered above\n                if not len(self.uid) == 12:\n                    raise InvalidArgument(\n                        f'Please pass an auto-generated uid instead of \"{self.uid}\". Resolve by running: ln.track(\"{base62_12()}\")'\n                    )\n                aux_transform = (\n                    Transform.filter(uid__startswith=self.uid)\n                    .order_by(\"-created_at\")\n                    .first()\n                )\n            else:\n                # deal with a hash-based match\n                # the user might have a made a copy of the notebook or script\n                # and actually wants to create a new transform\n                if aux_transform is not None and not aux_transform.key.endswith(key):\n                    prompt = f\"Found transform with same hash but different key: {aux_transform.key}. 
Did you rename your {transform_kind} to {key} (1) or intentionally made a copy (2)?\"\n                    response = (\n                        \"1\" if os.getenv(\"LAMIN_TESTING\") == \"true\" else input(prompt)\n                    )\n                    assert response in {\"1\", \"2\"}, (  # noqa: S101\n                        f\"Please respond with either 1 or 2, not {response}\"\n                    )\n                    if response == \"2\":\n                        aux_transform, transform_hash = (\n                            None,\n                            None,\n                        )  # make a new transform\n            if aux_transform is not None:\n                uid, target_transform, message = self._process_aux_transform(\n                    aux_transform, transform_hash\n                )\n                if message != \"\":\n                    logger.important(message)\n            else:\n                uid = f\"{self.uid}0000\" if self.uid is not None else None\n                target_transform = None\n            self.uid, transform = uid, target_transform\n        if self.version is not None:\n            # test inconsistent version passed\n            if (\n                transform is not None\n                and transform.version_tag is not None  # type: ignore\n                and self.version != transform.version_tag  # type: ignore\n            ):\n                raise ValueError(\n                    f\"Transform is already tagged with version {transform.version_tag}, but you passed {self.version}\\n\"  # noqa: S608\n                    f\"If you want to update the transform version, set it outside ln.track(): transform.version_tag = '{self.version}'; transform.save()\"\n                )\n            # test whether version was already used for another member of the family\n            if self.uid is not None and len(self.uid) == 16:\n                suid, vuid = (self.uid[:-4], self.uid[-4:])\n                transform = 
Transform.filter(\n                    uid__startswith=suid, version_tag=self.version\n                ).one_or_none()\n                if transform is not None and vuid != transform.uid[-4:]:\n                    better_version = bump_version_function(self.version)\n                    raise SystemExit(\n                        f\"✗ version '{self.version}' is already taken by Transform('{transform.uid}'); please set another version, e.g., ln.context.version = '{better_version}'\"\n                    )\n        # make a new transform record\n        if transform is None:\n            assert key is not None  # noqa: S101\n            transform = Transform(  # type: ignore\n                uid=self.uid,\n                version_tag=self.version,\n                description=description,\n                key=key,\n                reference=transform_ref,\n                reference_type=transform_ref_type,\n                kind=transform_kind,\n                source_code=source_code_to_store,\n                skip_hash_lookup=source_code is not None,\n            )\n            if source_code is not None:\n                transform.hash = transform_hash\n            transform = transform.save()\n            self._logging_message_track += (\n                f\"created Transform('{transform.uid}', key='{transform.key}')\"\n            )\n        else:\n            uid = transform.uid\n            # transform was already saved via `finish()`\n            transform_was_saved = transform.source_code is not None\n            # check whether the transform.key is consistent\n            if transform.key != key:\n                self._logging_message_track += (\n                    f\"renaming transform {transform.key} to {key}\"\n                )\n                transform.key = key\n                transform.save()\n            elif transform.description != description and description is not None:\n                transform.description = description\n                
transform.save()\n                self._logging_message_track += (\n                    \"updated transform description, \"  # white space on purpose\n                )\n            elif (\n                transform.created_by_id != ln_setup.settings.user.id\n                and not transform_was_saved\n            ):\n                raise UpdateContext(\n                    f'{transform.created_by.name} ({transform.created_by.handle}) already works on this draft {transform.kind}.\\n\\nPlease create a revision via `ln.track(\"{uid[:-4]}{increment_base62(uid[-4:])}\")` or a new transform with a *different* key and `ln.track(\"{base62_12()}0000\")`.'\n                )\n            if transform.reference != transform_ref:\n                transform.reference = transform_ref\n                transform.reference_type = transform_ref_type\n                transform.save()\n                self._logging_message_track += (\n                    \"updated transform reference, \"  # white space on purpose\n                )\n            # check whether transform source code was already saved\n            if transform_was_saved:\n                bump_revision = False\n                if (\n                    transform.kind == \"notebook\"\n                    and self._notebook_runner != \"nbconvert\"\n                ):\n                    # we anticipate the user makes changes to the notebook source code\n                    # in an interactive session, hence we pro-actively bump the version number\n                    bump_revision = True\n                else:\n                    if transform_hash != transform.hash:\n                        bump_revision = True\n                    else:\n                        self._logging_message_track += f\"loaded Transform('{transform.uid}', key='{transform.key}')\"\n                if bump_revision:\n                    change_type = (\n                        \"re-running notebook with already-saved source code\"\n             
           if (\n                            transform.kind == \"notebook\"\n                            and self._notebook_runner != \"nbconvert\"\n                        )\n                        else \"source code changed\"\n                    )\n                    raise UpdateContext(\n                        f'✗ {change_type}, please update the `uid` argument in `track()` to \"{uid[:-4]}{increment_base62(uid[-4:])}\"'\n                    )\n            else:\n                self._logging_message_track += (\n                    f\"loaded Transform('{transform.uid}', key='{transform.key}')\"\n                )\n        self._transform = transform\n\n    def _finish(self, ignore_non_consecutive: None | bool = None) -> None:\n        \"\"\"Finish the run of a notebook or script.\n\n        - writes a timestamp: `run.finished_at`\n        - saves the source code if it is not yet saved: `transform.source_code`\n        - saves a run report: `run.report`\n\n        When called in a notebook, will prompt to save the notebook in your editor.\n\n        Args:\n            ignore_non_consecutive: Whether to ignore if a notebook was non-consecutively executed.\n\n        Examples:\n\n            See :doc:`/track`.\n\n        See Also:\n            `lamin save script.py` or `lamin save notebook.ipynb` → `docs </cli#lamin-save>`__\n\n        \"\"\"\n        from .._finish import save_context_core, save_run_logs\n\n        if self.run is None:\n            raise TrackNotCalled(\"Please run `ln.track()` before `ln.finish()`\")\n        if self._path is None:\n            if self.run.transform.kind in {\"script\", \"notebook\"}:\n                raise ValueError(\n                    \"Transform type is not allowed to be 'script' or 'notebook' because `context._path` is `None`.\"\n                )\n            self.run.finished_at = datetime.now(timezone.utc)\n            self.run.save()\n            # reset context so the next _track() starts clean (e.g. 
from decorator)\n            self._uid = None\n            self._run = None\n            self._transform = None\n            self._version = None\n            self._description = None\n            self._is_step_decorator_run = False\n            return None\n        self.run._status_code = 0\n        if self.transform.kind == \"notebook\":\n            return_code = save_context_core(\n                run=self.run,\n                transform=self.run.transform,\n                filepath=self._path,\n                finished_at=True,\n                ignore_non_consecutive=ignore_non_consecutive,\n                is_retry=self._is_finish_retry,\n                notebook_runner=self._notebook_runner,\n            )\n            if return_code == \"retry\":\n                self._is_finish_retry = True\n                return None\n        else:\n            self.run.finished_at = datetime.now(timezone.utc)\n            self.run.save()  # persist finished_at (save_run_logs only saves when log file exists)\n            if ln_setup.settings.instance.is_on_hub and not self._is_step_decorator_run:\n                instance_slug = ln_setup.settings.instance.slug\n                ui_url = ln_setup.settings.instance.ui_url\n                logger.important(\n                    f\"go to: {ui_url}/{instance_slug}/transform/{self.transform.uid}\"\n                )\n            save_run_logs(self.run, save_run=True)\n            self._stream_tracker.finish()\n        # reset the context attributes so that somebody who runs `track()` after finish\n        # starts fresh\n        self._uid = None\n        self._run = None\n        self._transform = None\n        self._version = None\n        self._description = None\n        self._is_step_decorator_run = False\n\n\ncontext: Context = Context()\n"
  },
  {
    "path": "lamindb/core/_functions.py",
    "content": "import functools\nimport inspect\nfrom contextvars import ContextVar\nfrom datetime import datetime, timezone\nfrom pathlib import Path\nfrom typing import Callable, Literal, ParamSpec, TypeVar\n\nfrom lamindb.base import deprecated\n\nfrom ..models import Run\nfrom ._context import Context, get_key_from_module\nfrom ._context import context as global_context\n\nP = ParamSpec(\"P\")\nR = TypeVar(\"R\")\n\n# Create a context variable to store the current tracked run\ncurrent_tracked_run: ContextVar[Run | None] = ContextVar(\n    \"current_tracked_run\", default=None\n)\n\n\ndef get_current_tracked_run() -> Run | None:\n    \"\"\"Get the run object.\"\"\"\n    run = current_tracked_run.get()\n    if run is None:\n        run = global_context.run\n    return run\n\n\ndef _create_tracked_decorator(\n    uid: str | None = None,\n    is_flow: bool = True,\n    global_run: Literal[\"memorize\", \"clear\", \"none\"] = \"none\",\n    track_arg_aliases: bool = False,\n) -> Callable[[Callable[P, R]], Callable[P, R]]:\n    \"\"\"Internal helper to create tracked decorators.\n\n    Args:\n        uid: Persist the uid to identify this transform across renames.\n        is_flow: Triggered through @ln.flow(), otherwise @ln.step().\n    \"\"\"\n\n    def decorator_tracked(func: Callable[P, R]) -> Callable[P, R]:\n        # Get the original signature\n        sig = inspect.signature(func)\n\n        @functools.wraps(func)\n        def wrapper_tracked(*args: P.args, **kwargs: P.kwargs) -> R:\n            if global_context.run is None:\n                if not is_flow:\n                    raise RuntimeError(\n                        \"Please track the global run context before using @ln.step(): ln.track() or @ln.flow()\"\n                    )\n            else:\n                if is_flow:\n                    raise RuntimeError(\n                        \"Please use @ln.step() or clear the global run context before using @ln.flow(): no `ln.track()` or 
`@ln.flow(global_run='clear')`\"\n                    )\n            bound_args = sig.bind(*args, **kwargs)\n            bound_args.apply_defaults()\n            params = dict(bound_args.arguments)\n\n            initiated_by_run = get_current_tracked_run()\n            track_kwargs: dict = {}\n            if track_arg_aliases:\n                for key in (\"project\", \"space\", \"branch\", \"plan\", \"initiated_by_run\"):\n                    if key in params and params[key] is not None:\n                        track_kwargs[key] = params[key]\n                if \"initiated_by_run\" in track_kwargs:\n                    initiated_by_run = track_kwargs[\"initiated_by_run\"]\n            path_raw = inspect.getsourcefile(func)\n            path = None\n            # do not pass path when function is defined in an ipython cell\n            if path_raw is not None and Path(path_raw).exists():\n                path = Path(path_raw)\n            source_code = inspect.getsource(func) if path is None else None\n            transform_kind: Literal[\"function\", \"script\"] = (\n                \"function\" if path is None else \"script\"\n            )\n            caller_module = func.__module__\n            key = get_key_from_module(caller_module)\n            if (\n                key is None\n                and path is None\n                and caller_module in {\"__main__\", \"__mp_main__\"}\n            ):\n                key = f\"{initiated_by_run.transform.key}\"\n            context = Context(uid=uid, path=path)\n            context._track(\n                uid,\n                path=path,\n                key=key,\n                source_code=source_code,\n                kind=transform_kind,\n                entrypoint=func.__qualname__,\n                params=params,\n                new_run=True,\n                project=track_kwargs.get(\"project\"),\n                space=track_kwargs.get(\"space\"),\n                
branch=track_kwargs.get(\"branch\"),\n                plan=track_kwargs.get(\"plan\"),\n                initiated_by_run=initiated_by_run,\n                stream_tracking=is_flow,\n            )\n            token = current_tracked_run.set(context.run)\n            if global_run in {\"memorize\", \"clear\"}:\n                global_context._run = context.run\n            try:\n                result = func(*args, **kwargs)\n                context._finish()\n                return result\n            except Exception as e:\n                run = context.run\n                run.finished_at = datetime.now(timezone.utc)\n                run._status_code = 1  # errored\n                run.save()\n                raise e\n            finally:\n                if (\n                    global_run == \"clear\"\n                    and global_context.run == current_tracked_run.get()\n                ):\n                    global_context._run = None\n                current_tracked_run.reset(token)\n\n        return wrapper_tracked\n\n    return decorator_tracked\n\n\ndef flow(\n    uid: str | None = None,\n    global_run: Literal[\"memorize\", \"clear\", \"none\"] = \"clear\",\n    track_arg_aliases: bool = True,\n) -> Callable[[Callable[P, R]], Callable[P, R]]:\n    \"\"\"Use `@flow()` to track a function as a workflow.\n\n    You will be able to see inputs, outputs, and parameters of the function in the data lineage graph.\n\n    The decorator creates a :class:`~lamindb.Transform` with kind `\"script\"` that maps onto the file in\n    which the function is defined.\n    The function maps onto an entrypoint of the `transform`.\n    A function execution creates a :class:`~lamindb.Run` object that stores the function name in `run.entrypoint`.\n    If the function is defined in a notebook cell or another ephemeral context, the transform is created with kind `\"function\"`.\n\n    By default `@ln.flow()`, like `ln.track()`, creates a global run context that can be 
accessed with `ln.context.run`.\n\n    Args:\n        uid: Persist the uid to identify a transform across renames.\n        global_run: If `\"clear\"`, set the global run context `ln.context.run` and clear after the function completes.\n            If `\"memorize\"`, set the global run context and do not clear after the function completes.\n            Set this to `\"none\"` if you want to track concurrent executions of a `flow()` in the same Python process.\n        track_arg_aliases: If `True` (default), maps function arguments with names `project`, `space`, `branch`,\n            `plan`, and `initiated_by_run` to matching `ln.track()` arguments while also keeping them in `run.params`\n            for reproducibility. Pass `False` to disable this mapping.\n\n    Examples:\n\n        To sync a workflow with a file in a git repo, see: :ref:`sync-code-with-git`.\n\n        For an extensive guide, see: :ref:`manage-workflows`. Here follow some examples.\n\n        .. literalinclude:: scripts/my_workflow.py\n            :language: python\n            :caption: my_workflow.py\n\n        .. literalinclude:: scripts/my_workflow_with_step.py\n            :language: python\n            :caption: my_workflow_with_step.py\n\n        .. 
literalinclude:: scripts/my_workflow_with_click.py\n            :language: python\n            :caption: my_workflow_with_click.py\n\n\n    \"\"\"\n    return _create_tracked_decorator(\n        uid=uid,\n        is_flow=True,\n        global_run=global_run,\n        track_arg_aliases=track_arg_aliases,\n    )\n\n\ndef step(uid: str | None = None) -> Callable[[Callable[P, R]], Callable[P, R]]:\n    \"\"\"Use `@step()` to track a function as a step.\n\n    Behaves like :func:`~lamindb.flow()`, but acts as a step in a workflow and does\n    not create a global run context.\n    It errors if no initiating run (either global or local run context) exists.\n\n    See :func:`~lamindb.flow()` for examples.\n\n    Args:\n        uid: Persist the uid to identify a transform across renames.\n    \"\"\"\n    return _create_tracked_decorator(uid=uid, is_flow=False)\n\n\n@deprecated(\"step\")\ndef tracked(uid: str | None = None) -> Callable[[Callable[P, R]], Callable[P, R]]:\n    return step(uid)\n"
  },
  {
    "path": "lamindb/core/_mapped_collection.py",
    "content": "from __future__ import annotations\n\nfrom collections import Counter\nfrom functools import reduce\nfrom typing import TYPE_CHECKING, Literal\n\nimport numpy as np\nimport pandas as pd\nfrom lamin_utils import logger\nfrom lamindb_setup.core.upath import UPath\n\nfrom .storage._anndata_accessor import (\n    ArrayType,\n    ArrayTypes,\n    GroupType,\n    GroupTypes,\n    StorageType,\n    _safer_read_index,\n    get_spec,\n    registry,\n)\n\nif TYPE_CHECKING:\n    from lamindb_setup.types import AnyPathStr\n\n\nclass _Connect:\n    def __init__(self, storage):\n        if isinstance(storage, UPath):\n            # force no external compression even for files with .gz extension. REMOVE LATER\n            self.conn, self.store = registry.open(\"h5py\", storage, compression=None)\n            self.to_close = True\n        else:\n            self.conn, self.store = None, storage\n            self.to_close = False\n\n    def __enter__(self):\n        return self.store\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        self.close()\n\n    def close(self):\n        if not self.to_close:\n            return\n        if hasattr(self.store, \"close\"):\n            self.store.close()\n        if hasattr(self.conn, \"close\"):\n            self.conn.close()\n\n\n_decode = np.frompyfunc(lambda x: x.decode(\"utf-8\"), 1, 1)\n\n\nclass MappedCollection:\n    \"\"\"Map-style collection for use in data loaders.\n\n    This class virtually concatenates `AnnData` arrays as a `pytorch map-style dataset\n    <https://pytorch.org/docs/stable/data.html#map-style-datasets>`__.\n\n    If your `AnnData` collection is in the cloud, move them into a local cache\n    first for faster access.\n\n    `__getitem__` of the `MappedCollection` object takes a single integer index\n    and returns a dictionary with the observation data sample for this index from\n    the `AnnData` objects in `path_list`. 
The dictionary has keys for `layers_keys`\n    (`.X` is in `\"X\"`), `obs_keys`, `obsm_keys` (under `f\"obsm_{key}\"`) and also `\"_store_idx\"`\n    for the index of the `AnnData` object containing this observation sample.\n\n    .. note::\n\n        For a guide, see :doc:`docs:scrna-mappedcollection`.\n\n        For more convenient use within :class:`~lamindb.core.MappedCollection`,\n        see :meth:`~lamindb.Collection.mapped`.\n\n        This currently only works for collections of `AnnData` objects.\n\n        The implementation was influenced by the `SCimilarity\n        <https://github.com/Genentech/scimilarity>`__ data loader.\n\n\n    Args:\n        path_list: A list of paths to `AnnData` objects stored in `.h5ad` or `.zarr` formats.\n        layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``\"X\"`` in the list\n            retrieves ``.X``. ``\"raw.X\"`` retrieves ``.X`` from ``.raw`` slot.\n            Keys not present in an object are omitted from the output for that object.\n        obsm_keys: Keys from the ``.obsm`` slots. Keys not present in an object are\n            omitted from the output for that object.\n        obs_keys: Keys from the ``.obs`` slots. Keys not present in an object are\n            omitted from the output for that object.\n        obs_filter: Select only observations with these values for the given obs columns.\n            Should be a dictionary with obs column names as keys\n            and filtering values (a string or a list of strings) as values.\n        join: `\"inner\"` or `\"outer\"` virtual joins. If ``None`` is passed,\n            does not join. 
The join is applied to ``layers_keys`` except for ``\"raw.X\"``.\n        encode_labels: Encode labels into integers.\n            Can be a list with elements from ``obs_keys``.\n        unknown_label: Encode this label to -1.\n            Can be a dictionary with keys from ``obs_keys`` if ``encode_labels=True``\n            or from ``encode_labels`` if it is a list.\n        cache_categories: Enable caching categories of ``obs_keys`` for faster access.\n        parallel: Enable sampling with multiple processes.\n        dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm`` to this dtype.\n    \"\"\"\n\n    def __init__(\n        self,\n        path_list: list[AnyPathStr],\n        layers_keys: str | list[str] | None = None,\n        obs_keys: str | list[str] | None = None,\n        obsm_keys: str | list[str] | None = None,\n        obs_filter: dict[str, str | list[str]] | None = None,\n        join: Literal[\"inner\", \"outer\"] | None = \"inner\",\n        encode_labels: bool | list[str] = True,\n        unknown_label: str | dict[str, str] | None = None,\n        cache_categories: bool = True,\n        parallel: bool = False,\n        dtype: str | None = None,\n    ):\n        if join not in {None, \"inner\", \"outer\"}:  # pragma: nocover\n            raise ValueError(\n                f\"join must be one of None, 'inner, or 'outer' but was {type(join)}\"\n            )\n\n        self.filtered = obs_filter is not None\n        if self.filtered and not isinstance(obs_filter, dict):\n            logger.warning(\n                \"Passing a tuple to `obs_filter` is deprecated, use a dictionary\"\n            )\n            obs_filter = {obs_filter[0]: obs_filter[1]}\n\n        if layers_keys is None:\n            self.layers_keys = [\"X\"]\n        else:\n            self.layers_keys = (\n                [layers_keys] if isinstance(layers_keys, str) else layers_keys\n            )\n\n        obsm_keys = [obsm_keys] if isinstance(obsm_keys, str) else obsm_keys\n      
  self.obsm_keys = obsm_keys\n\n        obs_keys = [obs_keys] if isinstance(obs_keys, str) else obs_keys\n        self.obs_keys = obs_keys\n\n        if isinstance(encode_labels, list):\n            if len(encode_labels) == 0:\n                encode_labels = False\n            elif obs_keys is None or not all(\n                enc_label in obs_keys for enc_label in encode_labels\n            ):\n                raise ValueError(\n                    \"All elements of `encode_labels` should be in `obs_keys`.\"\n                )\n        else:\n            if encode_labels:\n                encode_labels = obs_keys if obs_keys is not None else False\n        self.encode_labels = encode_labels\n\n        if encode_labels and isinstance(unknown_label, dict):\n            if not all(unkey in encode_labels for unkey in unknown_label):  # type: ignore\n                raise ValueError(\n                    \"All keys of `unknown_label` should be in `encode_labels` and `obs_keys`.\"\n                )\n        self.unknown_label = unknown_label\n\n        self.storages = []  # type: ignore\n        self.conns = []  # type: ignore\n        self.parallel = parallel\n        self.path_list = path_list\n        self._make_connections(path_list, parallel)\n\n        self._cache_has_raw: list[bool] = []\n        self._cache_obsm_keys: list[set[str]] = []\n        self._cache_obs_keys: list[set[str]] = []\n        self._cache_layers_keys: list[set[str]] = []\n        self._cache_keys()\n\n        self._cache_cats: dict = {}\n        if self.obs_keys is not None:\n            if cache_categories:\n                self._cache_categories(self.obs_keys)\n            self.encoders: dict = {}\n            if self.encode_labels:\n                self._make_encoders(self.encode_labels)  # type: ignore\n\n        self.n_obs_list = []\n        self.indices_list = []\n        for i, storage in enumerate(self.storages):\n            with _Connect(storage) as store:\n                X = 
store[\"X\"]\n                store_path = self.path_list[i]\n                self._check_csc_raise_error(X, \"X\", store_path)\n                if isinstance(X, ArrayTypes):  # type: ignore\n                    n_obs_storage = X.shape[0]\n                else:\n                    n_obs_storage = X.attrs[\"shape\"][0]\n                if self.filtered:\n                    indices_storage_mask = None\n                    for obs_filter_key, obs_filter_values in obs_filter.items():\n                        if isinstance(obs_filter_values, tuple):\n                            obs_filter_values = list(obs_filter_values)\n                        elif not isinstance(obs_filter_values, list):\n                            obs_filter_values = [obs_filter_values]\n                        if obs_filter_key in store[\"obs\"]:\n                            obs_labels = self._get_labels(store, obs_filter_key)\n                            obs_filter_mask = np.isin(obs_labels, obs_filter_values)\n                        else:\n                            obs_filter_mask = np.full(n_obs_storage, False)\n                        if pd.isna(obs_filter_values).any():\n                            obs_filter_mask |= pd.isna(obs_labels)\n                        if indices_storage_mask is None:\n                            indices_storage_mask = obs_filter_mask\n                        else:\n                            indices_storage_mask &= obs_filter_mask\n                    indices_storage = np.where(indices_storage_mask)[0]\n                    n_obs_storage = len(indices_storage)\n                else:\n                    indices_storage = np.arange(n_obs_storage)\n                self.n_obs_list.append(n_obs_storage)\n                self.indices_list.append(indices_storage)\n                for layer_key in self.layers_keys:\n                    if layer_key == \"X\":\n                        continue\n                    lazy_data = self._get_lazy_data(store, layer_key, i)\n   
                 if lazy_data is None:\n                        continue\n                    self._check_csc_raise_error(\n                        lazy_data,\n                        \"raw.X\" if layer_key == \"raw.X\" else f\"layers/{layer_key}\",\n                        store_path,\n                    )\n                if self.obsm_keys is not None:\n                    for obsm_key in self.obsm_keys:\n                        if obsm_key in self._cache_obsm_keys[i]:\n                            self._check_csc_raise_error(\n                                store[\"obsm\"][obsm_key],\n                                f\"obsm/{obsm_key}\",\n                                store_path,\n                            )\n        self.n_obs = sum(self.n_obs_list)\n\n        self.indices = np.hstack(self.indices_list)\n        self.storage_idx = np.repeat(np.arange(len(self.storages)), self.n_obs_list)\n\n        self.join_vars: Literal[\"inner\", \"outer\"] | None = join\n        self.var_indices: list | None = None\n        self.var_joint: pd.Index | None = None\n        self.n_vars_list: list | None = None\n        self.var_list: list | None = None\n        self.n_vars: int | None = None\n        if self.join_vars is not None:\n            self._make_join_vars()\n            self.n_vars = len(self.var_joint)\n\n        self._dtype = dtype\n        self._closed = False\n\n    def _make_connections(self, path_list: list, parallel: bool):\n        for path in path_list:\n            path = UPath(path)\n            if path.exists() and path.is_file():  # type: ignore\n                if parallel:\n                    conn, storage = None, path\n                else:\n                    # force no external compression even for files with .gz extension. 
REMOVE LATER\n                    conn, storage = registry.open(\"h5py\", path, compression=None)\n            else:\n                conn, storage = registry.open(\"zarr\", path)\n            self.conns.append(conn)\n            self.storages.append(storage)\n\n    def _cache_keys(self):\n        for storage in self.storages:\n            with _Connect(storage) as store:\n                store_keys = registry.keys(store)\n                self._cache_has_raw.append(\"raw\" in store_keys)\n                for group in (\"obsm\", \"obs\", \"layers\"):\n                    cache = getattr(self, f\"_cache_{group}_keys\")\n                    cache.append(\n                        set(store_keys[group]) if group in store_keys else set()\n                    )\n\n    def _cache_categories(self, obs_keys: list):\n        self._cache_cats = {}\n        for label in obs_keys:\n            self._cache_cats[label] = []\n            for i, storage in enumerate(self.storages):\n                if label not in self._cache_obs_keys[i]:\n                    self._cache_cats[label].append(None)\n                    continue\n                with _Connect(storage) as store:\n                    cats = self._get_categories(store, label)\n                    if cats is not None:\n                        cats = (\n                            _decode(cats) if isinstance(cats[0], bytes) else cats[...]\n                        )\n                    self._cache_cats[label].append(cats)\n\n    def _make_encoders(self, encode_labels: list):\n        for label in encode_labels:\n            cats = self.get_merged_categories(label)\n            encoder = {}\n            if isinstance(self.unknown_label, dict):\n                unknown_label = self.unknown_label.get(label, None)\n            else:\n                unknown_label = self.unknown_label\n            if unknown_label is not None and unknown_label in cats:\n                cats.remove(unknown_label)\n                
encoder[unknown_label] = -1\n            encoder.update({cat: i for i, cat in enumerate(cats)})\n            self.encoders[label] = encoder\n\n    def _read_vars(self):\n        self.var_list = []\n        self.n_vars_list = []\n        for storage in self.storages:\n            with _Connect(storage) as store:\n                vars = _safer_read_index(store[\"var\"])\n                self.var_list.append(vars)\n                self.n_vars_list.append(len(vars))\n\n    def _make_join_vars(self):\n        if self.var_list is None:\n            self._read_vars()\n        vars_eq = all(self.var_list[0].equals(vrs) for vrs in self.var_list[1:])\n        if vars_eq:\n            self.join_vars = None\n            self.var_joint = self.var_list[0]\n            return\n\n        if self.join_vars == \"inner\":\n            self.var_joint = reduce(pd.Index.intersection, self.var_list)\n            if len(self.var_joint) == 0:\n                raise ValueError(\n                    \"The provided AnnData objects don't have shared variables.\\n\"\n                    \"Use join='outer'.\"\n                )\n            self.var_indices = [\n                vrs.get_indexer(self.var_joint) for vrs in self.var_list\n            ]\n        elif self.join_vars == \"outer\":\n            self.var_joint = reduce(pd.Index.union, self.var_list)\n            self.var_indices = [\n                self.var_joint.get_indexer(vrs) for vrs in self.var_list\n            ]\n\n    def check_vars_sorted(self, ascending: bool = True) -> bool:\n        \"\"\"Returns `True` if all variables are sorted in all objects.\"\"\"\n        if self.var_list is None:\n            self._read_vars()\n        if ascending:\n            vrs_sort_status = (vrs.is_monotonic_increasing for vrs in self.var_list)\n        else:\n            vrs_sort_status = (vrs.is_monotonic_decreasing for vrs in self.var_list)\n        return all(vrs_sort_status)\n\n    def check_vars_non_aligned(self, vars: pd.Index | list) -> 
list[int]:\n        \"\"\"Returns indices of objects with non-aligned variables.\n\n        Args:\n            vars: Check alignment against these variables.\n        \"\"\"\n        if self.var_list is None:\n            self._read_vars()\n        vars = pd.Index(vars)\n        return [i for i, vrs in enumerate(self.var_list) if not vrs.equals(vars)]\n\n    def _check_csc_raise_error(\n        self, elem: GroupType | ArrayType, key: str, path: AnyPathStr\n    ):\n        if isinstance(elem, ArrayTypes):  # type: ignore\n            return\n        if get_spec(elem).encoding_type == \"csc_matrix\":\n            if not self.parallel:\n                self.close()\n            raise ValueError(\n                f\"{key} in {path} is a csc matrix, `MappedCollection` doesn't support this format yet.\"\n            )\n\n    def __len__(self):\n        return self.n_obs\n\n    @property\n    def shape(self) -> tuple[int, int]:\n        \"\"\"Shape of the (virtually aligned) dataset.\"\"\"\n        return (self.n_obs, self.n_vars)\n\n    @property\n    def original_shapes(self) -> list[tuple[int, int]]:\n        \"\"\"Shapes of the underlying AnnData objects (with `obs_filter` applied).\"\"\"\n        if self.n_vars_list is None:\n            n_vars_list = [None] * len(self.n_obs_list)\n        else:\n            n_vars_list = self.n_vars_list\n        return list(zip(self.n_obs_list, n_vars_list))\n\n    def __getitem__(self, idx: int):\n        obs_idx = self.indices[idx]\n        storage_idx = self.storage_idx[idx]\n        if self.var_indices is not None:\n            var_idxs_join = self.var_indices[storage_idx]\n        else:\n            var_idxs_join = None\n        out = {\"_store_idx\": storage_idx}\n        with _Connect(self.storages[storage_idx]) as store:\n            for layers_key in self.layers_keys:\n                lazy_data = self._get_lazy_data(store, layers_key, storage_idx)\n                if lazy_data is None:\n                    continue\n       
         # do not apply join to raw.X, return as is\n                join_vars = None if layers_key == \"raw.X\" else self.join_vars\n                out[layers_key] = self._get_data_idx(\n                    lazy_data, obs_idx, join_vars, var_idxs_join, self.n_vars\n                )\n            if self.obsm_keys is not None:\n                for obsm_key in self.obsm_keys:\n                    if obsm_key not in self._cache_obsm_keys[storage_idx]:\n                        continue\n                    lazy_data = store[\"obsm\"][obsm_key]\n                    out[f\"obsm_{obsm_key}\"] = self._get_data_idx(lazy_data, obs_idx)\n            if self.obs_keys is not None:\n                for label in self.obs_keys:\n                    if label not in self._cache_obs_keys[storage_idx]:\n                        continue\n                    if label in self._cache_cats:\n                        cats = self._cache_cats[label][storage_idx]\n                        if cats is None:\n                            cats = []\n                    else:\n                        cats = None\n                    label_idx = self._get_obs_idx(store, obs_idx, label, cats)\n                    if label in self.encoders and label_idx is not np.nan:\n                        label_idx = self.encoders[label][label_idx]\n                    out[label] = label_idx\n        return out\n\n    def _get_lazy_data(self, store: StorageType, layers_key: str, storage_idx: int):\n        if layers_key == \"X\":\n            lazy_data = store[\"X\"]  # type: ignore\n        elif layers_key == \"raw.X\" and self._cache_has_raw[storage_idx]:\n            lazy_data = store[\"raw\"][\"X\"]  # type: ignore\n        elif layers_key in self._cache_layers_keys[storage_idx]:\n            lazy_data = store[\"layers\"][layers_key]  # type: ignore\n        else:\n            lazy_data = None\n        return lazy_data\n\n    def _get_data_idx(\n        self,\n        lazy_data: ArrayType | GroupType,\n        
idx: int,\n        join_vars: Literal[\"inner\", \"outer\"] | None = None,\n        var_idxs_join: list | None = None,\n        n_vars_out: int | None = None,\n    ):\n        \"\"\"Get the index for the data.\"\"\"\n        if isinstance(lazy_data, ArrayTypes):  # type: ignore\n            lazy_data_idx = lazy_data[idx]  # type: ignore\n            if join_vars is None:\n                result = lazy_data_idx\n                if self._dtype is not None:\n                    result = result.astype(self._dtype, copy=False)\n            elif join_vars == \"outer\":\n                dtype = lazy_data_idx.dtype if self._dtype is None else self._dtype\n                result = np.zeros(n_vars_out, dtype=dtype)\n                result[var_idxs_join] = lazy_data_idx\n            else:  # inner join\n                result = lazy_data_idx[var_idxs_join]\n                if self._dtype is not None:\n                    result = result.astype(self._dtype, copy=False)\n            return result\n        else:  # assume csr_matrix here\n            data = lazy_data[\"data\"]  # type: ignore\n            indices = lazy_data[\"indices\"]  # type: ignore\n            indptr = lazy_data[\"indptr\"]  # type: ignore\n            s = slice(*(indptr[idx : idx + 2]))\n            data_s = data[s]\n            dtype = data_s.dtype if self._dtype is None else self._dtype\n            if join_vars == \"outer\":\n                lazy_data_idx = np.zeros(n_vars_out, dtype=dtype)\n                lazy_data_idx[var_idxs_join[indices[s]]] = data_s\n            else:\n                lazy_data_idx = np.zeros(lazy_data.attrs[\"shape\"][1], dtype=dtype)  # type: ignore\n                lazy_data_idx[indices[s]] = data_s\n                if join_vars == \"inner\":\n                    lazy_data_idx = lazy_data_idx[var_idxs_join]\n            return lazy_data_idx\n\n    def _get_obs_idx(\n        self,\n        storage: StorageType,\n        idx: int,\n        label_key: str,\n        categories: 
list | None = None,\n    ):\n        \"\"\"Get the index for the label by key.\"\"\"\n        obs = storage[\"obs\"]  # type: ignore\n        # how backwards compatible do we want to be here actually?\n        if isinstance(obs, ArrayTypes):  # type: ignore\n            label = obs[idx][obs.dtype.names.index(label_key)]\n        else:\n            labels = obs[label_key]\n            if isinstance(labels, ArrayTypes):  # type: ignore\n                label = labels[idx]\n            else:\n                label = labels[\"codes\"][idx]\n                if label == -1:\n                    return np.nan\n        if categories is not None:\n            cats = categories\n        else:\n            cats = self._get_categories(storage, label_key)\n        if cats is not None and len(cats) > 0:\n            label = cats[label]\n        if isinstance(label, bytes):\n            label = label.decode(\"utf-8\")\n        return label\n\n    def get_label_weights(\n        self,\n        obs_keys: str | list[str],\n        scaler: float | None = None,\n        return_categories: bool = False,\n    ):\n        \"\"\"Get all weights for the given label keys.\n\n        This counts the number of labels for each label and returns\n        weights for each obs label according to the formula `1 / num of this label in the data`.\n        If `scaler` is provided, then `scaler / (scaler + num of this label in the data)`.\n\n        Args:\n            obs_keys: A key in the ``.obs`` slots or a list of keys. If a list is provided,\n                the labels from the obs keys will be concatenated with ``\"__\"`` delimiter\n            scaler: Use this number to scale the provided weights.\n            return_categories: If `False`, returns weights for each observation,\n                can be directly passed to a sampler. 
If `True`, returns a dictionary with\n                unique categories for labels (concatenated if `obs_keys` is a list)\n                and their weights.\n        \"\"\"\n        if isinstance(obs_keys, str):\n            obs_keys = [obs_keys]\n        labels_list = []\n        for label_key in obs_keys:\n            labels_to_str = self.get_merged_labels(label_key).astype(str).astype(\"O\")\n            labels_list.append(labels_to_str)\n        if len(labels_list) > 1:\n            labels = [\"__\".join(labels_obs) for labels_obs in zip(*labels_list)]\n        else:\n            labels = labels_list[0]\n        counter = Counter(labels)\n        if return_categories:\n            return {\n                k: 1.0 / v if scaler is None else scaler / (v + scaler)\n                for k, v in counter.items()\n            }\n        counts = np.array([counter[label] for label in labels])\n        if scaler is None:\n            weights = 1.0 / counts\n        else:\n            weights = scaler / (counts + scaler)\n        return weights\n\n    def get_merged_labels(self, label_key: str):\n        \"\"\"Get merged labels for `label_key` from all `.obs`.\"\"\"\n        labels_merge = []\n        for i, storage in enumerate(self.storages):\n            with _Connect(storage) as store:\n                if label_key not in self._cache_obs_keys[i]:\n                    continue\n                labels = self._get_labels(store, label_key, storage_idx=i)\n                if self.filtered:\n                    labels = labels[self.indices_list[i]]\n                labels_merge.append(labels)\n        return np.hstack(labels_merge)\n\n    def get_merged_categories(self, label_key: str):\n        \"\"\"Get merged categories for `label_key` from all `.obs`.\"\"\"\n        cats_merge = set()\n        for i, storage in enumerate(self.storages):\n            with _Connect(storage) as store:\n                if label_key not in self._cache_obs_keys[i]:\n                    
continue\n                if label_key in self._cache_cats:\n                    cats = self._cache_cats[label_key][i]\n                else:\n                    cats = self._get_categories(store, label_key)\n                if cats is not None:\n                    cats = _decode(cats) if isinstance(cats[0], bytes) else cats\n                    cats_merge.update(cats)\n                else:\n                    codes = self._get_codes(store, label_key)\n                    codes = _decode(codes) if isinstance(codes[0], bytes) else codes\n                    cats_merge.update(codes)\n        return sorted(cats_merge)\n\n    def _get_categories(self, storage: StorageType, label_key: str):\n        \"\"\"Get categories.\"\"\"\n        obs = storage[\"obs\"]  # type: ignore\n        if isinstance(obs, ArrayTypes):  # type: ignore\n            cat_key_uns = f\"{label_key}_categories\"\n            if cat_key_uns in storage[\"uns\"]:  # type: ignore\n                return storage[\"uns\"][cat_key_uns]  # type: ignore\n            else:\n                return None\n        else:\n            if \"__categories\" in obs:\n                cats = obs[\"__categories\"]\n                if label_key in cats:\n                    return cats[label_key]\n                else:\n                    return None\n            if label_key not in obs:\n                return None\n            labels = obs[label_key]\n            if isinstance(labels, GroupTypes):  # type: ignore\n                if \"categories\" in labels:\n                    return labels[\"categories\"]\n                else:\n                    return None\n            else:\n                if \"categories\" in labels.attrs:\n                    return labels.attrs[\"categories\"]\n                else:\n                    return None\n        return None\n\n    def _get_codes(self, storage: StorageType, label_key: str):\n        \"\"\"Get codes.\"\"\"\n        obs = storage[\"obs\"]  # type: ignore\n      
  if isinstance(obs, ArrayTypes):  # type: ignore\n            label = obs[label_key]\n        else:\n            label = obs[label_key]\n            if isinstance(label, ArrayTypes):  # type: ignore\n                return label[...]\n            else:\n                return label[\"codes\"][...]\n\n    def _get_labels(\n        self, storage: StorageType, label_key: str, storage_idx: int | None = None\n    ):\n        \"\"\"Get labels.\"\"\"\n        codes = self._get_codes(storage, label_key)\n        labels = _decode(codes) if isinstance(codes[0], bytes) else codes\n        if storage_idx is not None and label_key in self._cache_cats:\n            cats = self._cache_cats[label_key][storage_idx]\n        else:\n            cats = self._get_categories(storage, label_key)\n        if cats is not None:\n            cats = _decode(cats) if isinstance(cats[0], bytes) else cats\n            # NaN is coded as -1\n            nans = labels == -1\n            labels = cats[labels]\n            # detect and replace nans\n            if nans.any():\n                labels[nans] = np.nan\n\n        return labels\n\n    def close(self):\n        \"\"\"Close connections to array streaming backend.\n\n        No effect if `parallel=True`.\n        \"\"\"\n        for storage in self.storages:\n            if hasattr(storage, \"close\"):\n                storage.close()\n        for conn in self.conns:\n            if hasattr(conn, \"close\"):\n                conn.close()\n        self._closed = True\n\n    @property\n    def closed(self) -> bool:\n        \"\"\"Check if connections to array streaming backend are closed.\n\n        Does not matter if `parallel=True`.\n        \"\"\"\n        return self._closed\n\n    def __enter__(self):\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        self.close()\n\n    @classmethod\n    def torch_worker_init_fn(cls, worker_id):\n        \"\"\"`worker_init_fn` for `torch.utils.data.DataLoader`.\n\n        
Improves performance for `num_workers > 1`.\n        \"\"\"\n        from torch.utils.data import get_worker_info\n\n        mapped = get_worker_info().dataset\n        mapped.parallel = False\n        mapped.storages = []\n        mapped.conns = []\n        mapped._make_connections(mapped.path_list, parallel=False)\n"
  },
  {
    "path": "lamindb/core/_settings.py",
    "content": "from __future__ import annotations\n\nimport os\nimport sys\nfrom typing import TYPE_CHECKING\n\nimport lamindb_setup as ln_setup\nfrom lamin_utils import colors, logger\nfrom lamindb_setup import settings as setup_settings\nfrom lamindb_setup._set_managed_storage import set_managed_storage\nfrom lamindb_setup.core._settings_instance import sanitize_git_repo_url\nfrom lamindb_setup.core._settings_storage import (\n    StorageSettings,\n    convert_root_path_to_str,\n)\n\nfrom .subsettings._annotation_settings import AnnotationSettings, annotation_settings\nfrom .subsettings._creation_settings import CreationSettings, creation_settings\n\nif TYPE_CHECKING:\n    from collections.abc import Mapping\n    from pathlib import Path\n\n    from lamindb_setup.types import AnyPathStr\n    from upath import UPath\n\n\nVERBOSITY_TO_INT = {\n    \"error\": 0,  # 40\n    \"warning\": 1,  # 30\n    \"success\": 2,  # 25\n    \"info\": 3,  # 20\n    \"hint\": 4,  # 15\n    \"debug\": 5,  # 10\n}\nVERBOSITY_TO_STR: dict[int, str] = dict(\n    [reversed(i) for i in VERBOSITY_TO_INT.items()]  # type: ignore\n)\n\n\ndef raise_if_storage_managed_by_other_instance(storage) -> None:\n    storage_instance_uid = storage.instance_uid\n    if storage_instance_uid != setup_settings.instance.uid:\n        raise ValueError(\n            f\"Storage '{storage.root}' exists in another instance ({storage_instance_uid}), cannot write to it from here.\"\n        )\n\n\nclass Settings:\n    \"\"\"Settings.\n\n    Please use the global `ln.settings` object instead of instantiating this class yourself.\n    \"\"\"\n\n    def __init__(self):\n        self._verbosity_int: int = logger._verbosity\n        self._sync_git_repo: str | None = None\n\n    def __repr__(self) -> str:  # pragma: no cover\n        if \"sphinx\" in sys.modules:\n            return object.__repr__(self)\n\n        cls_name = colors.green(self.__class__.__name__)\n        verbosity_color = colors.yellow if 
self.verbosity == \"warning\" else colors.green\n        verbosity_str = verbosity_color(self.verbosity)\n\n        storage_root = self._storage_settings.root_as_str\n        storage_str = colors.italic(storage_root)\n\n        instance_str = colors.italic(self.instance_uid)\n        track_color = colors.green if self.track_run_inputs else colors.yellow\n        track_str = track_color(str(self.track_run_inputs))\n\n        lines = [\n            f\"{cls_name}\",\n            f\"  instance: {instance_str}\",\n            f\"  storage: {storage_str}\",\n            f\"  verbosity: {verbosity_str}\",\n            f\"  track_run_inputs: {track_str}\",\n        ]\n\n        if self.sync_git_repo:\n            repo_name = (\n                self.sync_git_repo.split(\"/\")[-1]\n                if \"/\" in self.sync_git_repo\n                else self.sync_git_repo\n            )\n            lines.append(f\"  sync_git_repo: {colors.italic(repo_name)}\")\n\n        return \"\\n\".join(lines)\n\n    @property\n    def creation(self) -> CreationSettings:\n        \"\"\"SQLRecord creation settings.\n\n        For example, `ln.settings.creation.search_names = False` will disable\n        searching for records with similar names during creation.\n        \"\"\"\n        return creation_settings\n\n    @property\n    def annotation(self) -> AnnotationSettings:\n        \"\"\"Artifact annotation settings.\n\n        For example, `ln.settings.creation.search_names = False` will disable\n        searching for records with similar names during creation.\n        \"\"\"\n        return annotation_settings\n\n    # note: this setting should probably be deprecated soon\n    # warnings could then be filtered with a regular warning mechanism\n    track_run_inputs: bool = True\n    \"\"\"Track run inputs (default `True`).\n\n    If this setting is true, an artifact is recorded as run input upon `.load()`, `.cache()` & `.open()` provided :func:`~lamindb.track` was called in the current 
compute (Python, R) session.\n    If :func:`~lamindb.track` was not called, you receive a warning message upon `.load()`, `.cache()` & `.open()`.\n\n    If you switch this setting to `False`, you won't see the warning message anymore and no run inputs will be recorded.\n\n    FAQ: :doc:`/faq/track-run-inputs`\n    \"\"\"\n    __using_key: str | None = None\n    _using_storage: str | None = None\n\n    @property\n    def _using_key(self) -> str | None:\n        \"\"\"Key for Django database settings.\"\"\"\n        return self.__using_key\n\n    @_using_key.setter\n    def _using_key(self, value: str | None):\n        ln_setup.settings._using_key = value\n        self.__using_key = value\n\n    @property\n    def _storage_settings(self) -> ln_setup.core.StorageSettings:\n        if self._using_storage is None:\n            storage_settings = ln_setup.settings.storage\n        else:\n            storage_settings = ln_setup.core.StorageSettings(root=self._using_storage)\n        return storage_settings\n\n    @property\n    def sync_git_repo(self) -> str | None:\n        \"\"\"Sync transforms with scripts in git repository.\n\n        If set, scripts will be synced with the specified git repository.\n\n        Example::\n\n            ln.settings.sync_git_repo = https://github.com/laminlabs/schmidt22\n\n        You can also pass the git repo URL via the environment variable `LAMINDB_SYNC_GIT_REPO`::\n\n            export LAMINDB_SYNC_GIT_REPO=https://github.com/laminlabs/schmidt22\n\n        You'll then see::\n\n            ln.settings.sync_git_repo\n            #> 'https://github.com/laminlabs/schmidt22'\n\n        \"\"\"\n        if self._sync_git_repo is not None:\n            return self._sync_git_repo\n        elif os.environ.get(\"LAMINDB_SYNC_GIT_REPO\") is not None:\n            return sanitize_git_repo_url(os.environ[\"LAMINDB_SYNC_GIT_REPO\"])\n        else:\n            return setup_settings.instance.git_repo\n\n    @sync_git_repo.setter\n    def 
sync_git_repo(self, value) -> None:\n        self._sync_git_repo = sanitize_git_repo_url(value)\n        if not self._sync_git_repo.startswith(\"https://\"):  # pragma: nocover\n            raise ValueError(\"git repository URL must start with 'https://'.\")\n\n    @property\n    def storage(self) -> StorageSettings:\n        \"\"\"Current default storage location for writes.\n\n        Examples:\n\n        Retrieve the storage settings::\n\n            ln.settings.storage\n            #> StorageSettings(root='s3://my-bucket')\n\n        Retrieve the storage root::\n\n            ln.settings.storage.root\n            #> UPath('s3://my-bucket')\n\n        Switch the current default storage location::\n\n            ln.settings.storage = \"s3://some-bucket\"\n\n        Pass additional `fsspec` `kwargs` via::\n\n            kwargs = dict(\n                profile=\"some_profile\", # fsspec arg\n                cache_regions=True # fsspec arg for s3\n            )\n            ln.settings.storage = \"s3://some-bucket\", kwargs\n        \"\"\"\n        return self._storage_settings\n\n    @storage.setter\n    def storage(self, path_kwargs: AnyPathStr | tuple[AnyPathStr, Mapping]):\n        from ..models import Storage\n\n        if isinstance(path_kwargs, tuple):\n            path, kwargs = path_kwargs\n            if isinstance(kwargs, str):\n                kwargs = {\"host\": kwargs}\n        else:\n            path, kwargs = path_kwargs, {}\n        root_as_str = convert_root_path_to_str(path)\n        exists = Storage.filter(root=root_as_str).one_or_none()\n        if exists is None:\n            response = input(\n                f\"Storage location {root_as_str} does not yet exist in the current instance. Do you want to continue with creating it? 
(y/n) \"\n            )\n            # logger.warning(f\"deprecated call because storage location does **not yet** exist; please create through ln.Storage(root={path}).save()\")\n            if response != \"y\":\n                return None\n            set_managed_storage(path, **kwargs)\n        else:\n            raise_if_storage_managed_by_other_instance(exists)\n            ssettings = StorageSettings(\n                root=exists.root,\n                region=exists.region,\n                uid=exists.uid,\n                instance_id=ln_setup.settings.instance._id,\n            )\n            ln_setup.settings.instance._storage = ssettings\n            kwargs.pop(\"host\", None)  # host is not needed for existing storage\n            settings.storage._set_fs_kwargs(**kwargs)\n\n    @property\n    def instance_uid(self) -> str:\n        \"\"\"The `uid` of the current instance.\"\"\"\n        return ln_setup.settings.instance.uid\n\n    @property\n    def cache_dir(self) -> UPath:\n        \"\"\"Cache root, a local directory to cache cloud files.\"\"\"\n        return ln_setup.settings.cache_dir\n\n    @property\n    def local_storage(self) -> StorageSettings:\n        \"\"\"An additional local default storage (a path to its root).\n\n        Is only available if :attr:`~lamindb.setup.core.InstanceSettings.keep_artifacts_local` is enabled.\n\n        Guide: :doc:`faq/keep-artifacts-local`\n        \"\"\"\n        return ln_setup.settings.instance.local_storage\n\n    @local_storage.setter\n    def local_storage(self, local_root: Path | str):\n        import lamindb as ln\n\n        # note duplication with storage setter!\n        ssettings = StorageSettings(root=local_root)\n        exists = ln.Storage.filter(root=ssettings.root_as_str).one_or_none()\n        if exists is None:\n            response = input(\n                f\"Storage location {ssettings.root_as_str} does not yet exist. Do you want to continue with creating it? 
(y/n) \"\n            )\n            # logger.warning(f\"deprecated call because storage location does **not yet** exist; going forward, please create through ln.Storage(root={path}).save() going forward\")\n            if response != \"y\":\n                return None\n        else:\n            raise_if_storage_managed_by_other_instance(exists)\n        ln_setup.settings.instance.local_storage = local_root\n\n    @property\n    def verbosity(self) -> str:\n        \"\"\"Logger verbosity (default `'warning'`).\n\n        - `'error'`: only show error messages\n        - `'warning'`: also show warning messages\n        - `'success'`: also show success and save messages\n        - `'info'`: also show info messages\n        - `'hint'`: also show hint messages\n        - `'debug'`: also show detailed debug messages\n        \"\"\"\n        return VERBOSITY_TO_STR[self._verbosity_int]\n\n    @verbosity.setter\n    def verbosity(self, verbosity: str | int):\n        if isinstance(verbosity, str):\n            verbosity_int = VERBOSITY_TO_INT[verbosity]\n        else:\n            verbosity_int = verbosity\n        self._verbosity_int = verbosity_int\n        logger.set_verbosity(verbosity_int)\n\n\nsettings = Settings()\n"
  },
  {
    "path": "lamindb/core/_sync_git.py",
    "content": "from __future__ import annotations\n\nimport subprocess\nfrom pathlib import Path\n\nfrom lamin_utils import logger\nfrom lamindb_setup import settings as setup_settings\nfrom lamindb_setup.core.hashing import hash_code\n\nfrom ..core._settings import sanitize_git_repo_url, settings\nfrom ..errors import BlobHashNotFound\n\n\ndef get_git_repo_from_remote(url: str | None = None, depth: int | None = 10) -> Path:\n    \"\"\"Clone the git repository if not already cloned.\n\n    If `depth` is provided, a shallow clone is performed and no tags are fetched.\n    \"\"\"\n    repo_url = url or settings.sync_git_repo\n    repo_dir = setup_settings.cache_dir / repo_url.split(\"/\")[-1]\n    if repo_dir.exists():\n        logger.debug(f\"git repo {repo_dir} already exists locally\")\n        return repo_dir\n    logger.important(\n        f\"running outside of synched git repo, cloning {repo_url} into {repo_dir}\"\n    )\n    args = [\"git\", \"clone\", f\"{repo_url}.git\"]\n    if depth is not None:\n        # if depth is provided, will not fetch tags\n        args += [\"--depth\", f\"{depth}\"]\n    result = subprocess.run(\n        args,\n        capture_output=True,\n        cwd=setup_settings.cache_dir,\n    )\n    if result.returncode != 0 or not repo_dir.exists():\n        raise RuntimeError(result.stderr.decode())\n    return repo_dir\n\n\ndef check_local_git_repo() -> bool:\n    result = subprocess.run(\n        [\"git\", \"config\", \"--get\", \"remote.origin.url\"],\n        capture_output=True,\n    )\n    result_str = result.stdout.decode().strip()\n    if result_str == \"\":\n        # running-not-in-a-git-repo\n        return False\n    else:\n        remote_url = sanitize_git_repo_url(result_str)\n        if remote_url == settings.sync_git_repo:\n            # running-in-correct-git-repo\n            return True\n        else:\n            logger.warning(\n                f\"running in git repo: {remote_url}, expected: 
{settings.sync_git_repo}\"\n            )\n            return False\n\n\ndef get_git_commit_hash(blob_hash: str, repo_dir: Path | None = None) -> str | None:\n    # Fetch all remote branches so that we can also search them\n    fetch_command = [\"git\", \"fetch\", \"origin\", \"+refs/heads/*:refs/remotes/origin/*\"]\n    subprocess.run(fetch_command, cwd=repo_dir, check=True)\n\n    # Find the commit containing the blob hash in all branches\n    command = [\n        \"git\",\n        \"log\",\n        \"--all\",\n        f\"--find-object={blob_hash}\",\n        \"--pretty=format:%H\",\n    ]\n    result = subprocess.run(\n        command,\n        capture_output=True,\n        cwd=repo_dir,\n    )\n    # We just care to find one commit\n    # Hence, we split by new line (\"\\n\") and use the first one\n    commit_hash = result.stdout.decode().split(\"\\n\")[0]\n\n    if not commit_hash or result.returncode == 1:\n        return None\n\n    default_branch = (\n        subprocess.run(\n            [\"git\", \"rev-parse\", \"--abbrev-ref\", \"origin/HEAD\"],\n            capture_output=True,\n            cwd=repo_dir,\n            text=True,\n        )\n        .stdout.strip()\n        .split(\"/\")[-1]\n    )\n\n    # Find all branches containing the commit\n    commit_containing_branches = subprocess.run(\n        [\"git\", \"branch\", \"--all\", \"--contains\", commit_hash],\n        capture_output=True,\n        cwd=repo_dir,\n        text=True,\n    ).stdout.split(\"\\n\")\n\n    # Clean up branch names and filter out the default branch\n    commit_containing_branches = [\n        branch.strip().replace(\"remotes/\", \"\")\n        for branch in commit_containing_branches\n        if branch.strip()\n    ]\n    non_default_branches = [\n        branch for branch in commit_containing_branches if default_branch not in branch\n    ]\n\n    if non_default_branches:\n        logger.warning(\n            f\"code blob hash {blob_hash} was found in non-default branch(es): 
{', '.join(non_default_branches)}\"\n        )\n\n    assert (  # noqa: S101\n        len(commit_hash) == 40\n    ), f\"commit hash |{commit_hash}| is not 40 characters long\"\n\n    return commit_hash\n\n\ndef get_filepath_within_git_repo(\n    commit_hash: str, blob_hash: str, repo_dir: Path | None\n) -> str:\n    # repo_dir might not point to the root of the\n    # the git repository because git log --find-object works\n    # from anywhere in the repo, hence, let's get the root\n    repo_root = (\n        subprocess.run(\n            [\"git\", \"rev-parse\", \"--show-toplevel\"],\n            capture_output=True,\n            cwd=repo_dir,\n        )\n        .stdout.decode()\n        .strip()\n    )\n    # Run the git commands separately to circumvent spawning a shell\n    git_command = [\"git\", \"ls-tree\", \"-r\", commit_hash]\n    git_process = subprocess.Popen(\n        git_command,\n        stdout=subprocess.PIPE,\n        cwd=repo_root,\n    )\n\n    grep_command = [\"grep\", \"-E\", blob_hash]\n    result = subprocess.run(\n        grep_command,\n        stdin=git_process.stdout,\n        capture_output=True,\n        cwd=repo_root,\n    )\n\n    # Close the stdout to allow git_process to receive a SIGPIPE if grep_command exits\n    git_process.stdout.close()\n    git_process.wait()\n\n    command = \" \".join(git_command) + \" | \" + \" \".join(grep_command)\n    if result.returncode != 0 and result.stderr.decode() != \"\":\n        raise RuntimeError(f\"{command}\\n{result.stderr.decode()}\")\n    if len(result.stdout.decode()) == 0:\n        raise RuntimeError(\n            f\"Could not find path in git repo {settings.sync_git_repo} running:\\n{command}\"\n            f\"\\nin local clone: {repo_root}\"\n        )\n    filepath = result.stdout.decode().split()[-1]\n    return filepath\n\n\ndef get_transform_reference_from_git_repo(path: Path) -> str:\n    blob_hash = hash_code(path).hexdigest()\n    commit_hash = None\n    if 
check_local_git_repo():\n        repo_dir = None\n    else:\n        repo_dir = get_git_repo_from_remote()\n    commit_hash = get_git_commit_hash(blob_hash, repo_dir=repo_dir)\n    if commit_hash is None:\n        if repo_dir is None:\n            repo_dir = Path.cwd()\n        raise BlobHashNotFound(\n            f\"❌ Did not find blob hash {blob_hash} in git repo: {settings.sync_git_repo}\\n\"\n            f\"Did you commit & push the script to the remote repo? -> {path}\"\n        )\n    gitpath = get_filepath_within_git_repo(commit_hash, blob_hash, repo_dir)\n    reference = f\"{settings.sync_git_repo}/blob/{commit_hash}/{gitpath}\"\n    return reference\n\n\ndef get_and_validate_git_metadata(\n    url: str,\n    path: str,\n    version: str | None = None,\n    branch: str | None = None,\n) -> tuple[str, str]:\n    \"\"\"Get metadata from a git repository.\n\n    Args:\n        url: Git repository URL (e.g., \"https://github.com/user/repo\")\n        path: Path to the main script within the repository\n        version: Optional version/tag to checkout\n        branch: Optional branch name (defaults to repository's default branch)\n\n    Returns:\n        A tuple `(url, commit_hash)`:\n            - url: The sanitized repository URL\n            - commit_hash: The hash of the commit checked out for the given version or branch\n\n    Raises:\n        RuntimeError: If cloning the repository or reading the commit hash fails\n        ValueError: If the requested version or branch cannot be checked out\n        FileNotFoundError: If the specified path does not exist in the repository or is not a file\n    \"\"\"\n    url = sanitize_git_repo_url(url)\n    repo_dir = get_git_repo_from_remote(url, depth=None)\n\n    # Determine the branch to use\n    if branch is None:\n        # Get the default branch if not specified\n        result_str = subprocess.run(\n            [\"git\", \"rev-parse\", \"--abbrev-ref\", \"origin/HEAD\"],\n            capture_output=True,\n            cwd=repo_dir,\n            text=True,\n 
       )\n        if result_str.returncode == 0:\n            branch = result_str.stdout.strip().split(\"/\")[-1]\n        else:\n            branch = \"main\"  # fallback to main\n\n    # Fetch the latest changes\n    subprocess.run(\n        [\"git\", \"fetch\", \"origin\"],\n        capture_output=True,\n        cwd=repo_dir,\n        check=True,\n    )\n\n    # Checkout the specified version or branch\n    if version is not None:\n        # Version takes precedence - checkout the tag/version\n        result = subprocess.run(\n            [\"git\", \"checkout\", version],\n            capture_output=True,\n            cwd=repo_dir,\n        )\n        if result.returncode != 0:\n            raise ValueError(\n                f\"Failed to checkout version {version}: {result.stderr.decode()}\"\n            )\n        logger.info(f\"checked out version {version}\")\n    else:\n        # Checkout the branch\n        result = subprocess.run(\n            [\"git\", \"checkout\", f\"origin/{branch}\"],\n            capture_output=True,\n            cwd=repo_dir,\n        )\n        if result.returncode != 0:\n            raise ValueError(\n                f\"Failed to checkout branch {branch}: {result.stderr.decode()}\"\n            )\n        logger.info(f\"checked out branch {branch}\")\n\n    # Get the current commit hash\n    result_str = subprocess.run(\n        [\"git\", \"rev-parse\", \"HEAD\"],\n        capture_output=True,\n        cwd=repo_dir,\n        text=True,\n    )\n    if result_str.returncode != 0:\n        raise RuntimeError(f\"Failed to get commit hash: {result_str.stderr}\")\n\n    commit_hash = result_str.stdout.strip()\n\n    assert (  # noqa: S101\n        len(commit_hash) == 40\n    ), f\"commit hash |{commit_hash}| is not 40 characters long\"\n\n    # Verify that the path exists as a file in the repository\n    file_path = repo_dir / path\n    if not file_path.exists():\n        raise FileNotFoundError(f\"Path '{path}' does not exist in 
repository {url}\")\n    if not file_path.is_file():\n        raise FileNotFoundError(\n            f\"Path '{path}' exists but is not a file in repository {url}\"\n        )\n    return url, commit_hash\n"
  },
  {
    "path": "lamindb/core/_track_environment.py",
    "content": "from __future__ import annotations\n\nimport subprocess\nimport sys\nfrom typing import TYPE_CHECKING\n\nimport lamindb_setup as ln_setup\nfrom lamin_utils import logger\n\nif TYPE_CHECKING:\n    from lamindb.models import Run\n\n\ndef track_python_environment(run: Run) -> None:\n    env_dir = ln_setup.settings.cache_dir / \"environments\" / f\"run_{run.uid}\"\n    filepath = env_dir / \"run_env_pip.txt\"\n    if not env_dir.exists():\n        filepath.parent.mkdir(parents=True)\n    # create a requirements.txt\n    # we don't create a conda environment.yml mostly for its slowness\n    try:\n        with open(filepath, \"w\") as f:\n            result = subprocess.run(\n                [sys.executable, \"-m\", \"pip\", \"freeze\"],\n                stdout=f,\n            )\n    except OSError as e:\n        result = None\n        logger.warning(f\"could not run pip freeze with error {e}\")\n    if result is not None and result.returncode == 0:\n        logger.info(f\"tracked pip freeze > {str(filepath)}\")\n"
  },
  {
    "path": "lamindb/core/exceptions.py",
    "content": "from ..errors import *  # noqa: F403 backward compat\n"
  },
  {
    "path": "lamindb/core/loaders.py",
    "content": "\"\"\"Loaders in :class:`lamindb.Artifact.load`.\n\n.. autodata:: SUPPORTED_SUFFIXES\n.. autofunction:: load_fcs\n.. autofunction:: load_tsv\n.. autofunction:: load_h5ad\n.. autofunction:: load_h5mu\n.. autofunction:: load_html\n.. autofunction:: load_json\n.. autofunction:: load_image\n.. autofunction:: load_svg\n\n\"\"\"\n\nfrom __future__ import annotations\n\nimport builtins\nimport re\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, Callable, cast\n\nfrom lamin_utils import logger\nfrom lamindb_setup import settings as setup_settings\nfrom lamindb_setup.core.upath import (\n    create_path,\n    extract_suffix_from_path,\n    infer_filesystem,\n)\n\nif TYPE_CHECKING:\n    from anndata import AnnData\n    from lamindb_setup.types import AnyPathStr\n    from mudata import MuData\n    from pandas import DataFrame\n\n    from lamindb.core.storage.types import ScverseDataStructures\n\n\nis_run_from_ipython = getattr(builtins, \"__IPYTHON__\", False)\n\n\n# tested in lamin-usecases\ndef load_fcs(*args, **kwargs) -> AnnData:\n    \"\"\"Load an `.fcs` file to `AnnData`.\"\"\"\n    try:\n        import readfcs\n    except ImportError:  # pragma: no cover\n        raise ImportError(\"Please install readfcs: pip install readfcs\") from None\n    return readfcs.read(*args, **kwargs)\n\n\n# for types below note that local UPaths are subclasses of Path\n# Path(UPath(...)) properly coerces local UPaths and throws an error for cloud UPaths\n\n\ndef load_csv(path: Path | str, **kwargs) -> DataFrame:\n    \"\"\"Load `.csv` file to `DataFrame`.\"\"\"\n    import pandas as pd\n\n    path_sanitized = Path(path)\n    return pd.read_csv(path_sanitized, **kwargs)\n\n\ndef load_parquet(path: Path | str, **kwargs) -> DataFrame:\n    \"\"\"Load `.parquet` file to `DataFrame`.\"\"\"\n    import pandas as pd\n\n    path_sanitized = Path(path)\n    return pd.read_parquet(path_sanitized, **kwargs)\n\n\ndef load_tsv(path: Path | str, **kwargs) -> DataFrame:\n 
   \"\"\"Load `.tsv` file to `DataFrame`.\"\"\"\n    import pandas as pd\n\n    path_sanitized = Path(path)\n    return pd.read_csv(path_sanitized, sep=\"\\t\", **kwargs)\n\n\ndef load_h5ad(filepath: AnyPathStr, **kwargs) -> AnnData:\n    \"\"\"Load an `.h5ad` file to `AnnData`.\"\"\"\n    from anndata import read_h5ad\n\n    fs, filepath_str = infer_filesystem(filepath)\n    compression = kwargs.pop(\"compression\", \"infer\")\n    with fs.open(filepath_str, mode=\"rb\", compression=compression) as file:\n        adata = read_h5ad(file, backed=False, **kwargs)\n        return adata\n\n\ndef load_h5mu(filepath: Path | str, **kwargs) -> MuData:\n    \"\"\"Load an `.h5mu` file to `MuData`.\"\"\"\n    import mudata as md\n\n    path_sanitized = Path(filepath)\n    return md.read_h5mu(path_sanitized, **kwargs)\n\n\ndef load_zarr(storepath, **kwargs):  # type: ignore\n    try:\n        from ..core.storage._zarr import load_zarr as _load_zarr\n    except ImportError:\n        raise ImportError(\"Please install zarr: pip install 'lamindb[zarr]'\") from None\n    return _load_zarr(storepath, **kwargs)\n\n\ndef load_html(path: Path | str) -> None | Path | str:\n    \"\"\"Display `.html` in ipython, otherwise return path.\"\"\"\n    if is_run_from_ipython:\n        path_sanitized = Path(path)\n        with path_sanitized.open(encoding=\"utf-8\") as f:\n            html_content = f.read()\n        # Extract the body content using regular expressions\n        body_content = re.findall(\n            r\"<body(?:.*?)>(?:.*?)</body>\", html_content, re.DOTALL\n        )\n        # Remove any empty body tags\n        if body_content:\n            body_content = body_content[0]\n            body_content = body_content.strip()  # type: ignore\n        from IPython.display import HTML, display\n\n        display(HTML(data=body_content))\n        return None\n    else:\n        return path\n\n\ndef load_json(path: Path | str) -> dict[str, Any] | list[Any]:\n    \"\"\"Load `.json` to 
`dict`.\"\"\"\n    import json\n\n    path_sanitized = Path(path)\n    with path_sanitized.open(encoding=\"utf-8\") as f:\n        data = json.load(f)\n    return data\n\n\ndef load_yaml(path: Path | str) -> dict[str, Any] | list[Any]:\n    \"\"\"Load `.yaml` to `dict`.\"\"\"\n    import yaml  # type: ignore\n\n    path_sanitized = Path(path)\n    with path_sanitized.open(encoding=\"utf-8\") as f:\n        data = yaml.safe_load(f)\n    return data\n\n\ndef load_image(path: Path | str) -> None | Path | str:\n    \"\"\"Display `.jpg`, `.gif` or `.png` in ipython, otherwise return path.\"\"\"\n    if is_run_from_ipython:\n        from IPython.display import Image, display\n\n        path_sanitized = Path(path)\n        display(Image(filename=path_sanitized.as_posix()))\n        return None\n    else:\n        return path\n\n\ndef load_svg(path: Path | str) -> None | Path | str:\n    \"\"\"Display `.svg` in ipython, otherwise return path.\"\"\"\n    if is_run_from_ipython:\n        from IPython.display import SVG, display\n\n        path_sanitized = Path(path)\n        display(SVG(filename=path_sanitized.as_posix()))\n        return None\n    else:\n        return path\n\n\ndef load_txt(path: Path | str) -> str:\n    \"\"\"Load `.txt` file to `str`.\"\"\"\n    path_sanitized = Path(path)\n    return path_sanitized.read_text(encoding=\"utf-8\")\n\n\ndef load_rds(path: Path | str) -> Path | str:\n    \"\"\"Just warn when trying to load `.rds`.\"\"\"\n    logger.warning(\"Please use `laminr` to load `.rds` files\")\n    return path\n\n\nFILE_LOADERS = {\n    \".csv\": load_csv,\n    \".csv.gz\": load_csv,\n    \".csv.tar.gz\": load_csv,\n    \".tsv\": load_tsv,\n    \".tsv.gz\": load_tsv,\n    \".tsv.tar.gz\": load_tsv,\n    \".h5ad\": load_h5ad,\n    \".h5ad.gz\": load_h5ad,\n    \".h5ad.tar.gz\": load_h5ad,\n    \".parquet\": load_parquet,\n    \".fcs\": load_fcs,\n    \".zarr\": load_zarr,\n    \".anndata.zarr\": load_zarr,\n    \".html\": load_html,\n    \".json\": 
load_json,\n    \".vitessce.json\": load_json,\n    \".yaml\": load_yaml,\n    \".h5mu\": load_h5mu,\n    \".gif\": load_image,\n    \".jpg\": load_image,\n    \".png\": load_image,\n    \".svg\": load_svg,\n    \".rds\": load_rds,\n    \".txt\": load_txt,\n    \".fasta\": load_txt,\n}\n\nSUPPORTED_SUFFIXES = [sfx for sfx in FILE_LOADERS.keys() if sfx != \".rds\"]\n\"\"\"Suffixes with defined artifact loaders.\"\"\"\n\n\ndef load_to_memory(\n    filepath: AnyPathStr, **kwargs\n) -> DataFrame | ScverseDataStructures | dict[str, Any] | list[Any] | AnyPathStr | None:\n    \"\"\"Load a file into memory.\n\n    Returns the filepath if no in-memory form is found.\n    May return None in interactive sessions for images.\n    \"\"\"\n    filepath = create_path(filepath)\n    suffix = extract_suffix_from_path(filepath)\n    loader = FILE_LOADERS.get(suffix, None)\n    if loader is None:\n        raise NotImplementedError(\n            f\"There is no loader for {suffix} files. Use .cache() to get the path.\"\n        )\n\n    filepath = setup_settings.paths.cloud_to_local(filepath, print_progress=True)\n\n    return cast(Callable[..., Any], loader)(filepath, **kwargs)\n"
  },
  {
    "path": "lamindb/core/storage/__init__.py",
    "content": "\"\"\"Storage API.\n\nValid suffixes.\n\n.. autodata:: VALID_SUFFIXES\n\nArray accessors.\n\n.. autoclass:: AnnDataAccessor\n.. autoclass:: SpatialDataAccessor\n.. autoclass:: BackedAccessor\n\"\"\"\n\nfrom typing import TYPE_CHECKING, Any\n\nfrom lamindb_setup.core.upath import LocalPathClasses, UPath, infer_filesystem\n\nfrom ._valid_suffixes import VALID_SUFFIXES\nfrom .paths import delete_storage\n\nif TYPE_CHECKING:\n    from ._anndata_accessor import AnnDataAccessor\n    from ._backed_access import BackedAccessor\n    from ._spatialdata_accessor import SpatialDataAccessor\n    from ._tiledbsoma import save_tiledbsoma_experiment\n    from .objects import infer_suffix, write_to_disk\n\n\n__all__ = [\n    \"AnnDataAccessor\",\n    \"BackedAccessor\",\n    \"LocalPathClasses\",\n    \"SpatialDataAccessor\",\n    \"UPath\",\n    \"VALID_SUFFIXES\",\n    \"delete_storage\",\n    \"infer_filesystem\",\n    \"infer_suffix\",\n    \"save_tiledbsoma_experiment\",\n    \"write_to_disk\",\n]\n\n_LAZY_EXPORTS = frozenset(\n    {\n        \"AnnDataAccessor\",\n        \"BackedAccessor\",\n        \"SpatialDataAccessor\",\n        \"infer_suffix\",\n        \"save_tiledbsoma_experiment\",\n        \"write_to_disk\",\n    }\n)\n\n\ndef __getattr__(name: str) -> Any:\n    if name not in _LAZY_EXPORTS:\n        raise AttributeError(f\"module {__name__!r} has no attribute {name!r}\")\n\n    attr: Any\n    if name == \"AnnDataAccessor\":\n        from ._anndata_accessor import AnnDataAccessor as attr\n    elif name == \"BackedAccessor\":\n        from ._backed_access import BackedAccessor as attr\n    elif name == \"SpatialDataAccessor\":\n        from ._spatialdata_accessor import SpatialDataAccessor as attr\n    elif name == \"save_tiledbsoma_experiment\":\n        from ._tiledbsoma import save_tiledbsoma_experiment as attr\n    else:\n        from .objects import infer_suffix, write_to_disk\n\n        attr = infer_suffix if name == \"infer_suffix\" else 
write_to_disk\n\n    globals()[name] = attr\n    return attr\n"
  },
  {
    "path": "lamindb/core/storage/_anndata_accessor.py",
    "content": "from __future__ import annotations\n\nimport inspect\nfrom functools import cached_property\nfrom importlib.metadata import version as get_version\nfrom itertools import chain\nfrom typing import TYPE_CHECKING, Callable, Literal, Union\n\nimport h5py\nimport numpy as np\nimport pandas as pd\nfrom anndata import AnnData\nfrom anndata._core.index import _normalize_indices\nfrom anndata._core.views import _resolve_idx\nfrom anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5\nfrom anndata._io.specs.registry import (\n    get_spec,\n    read_elem,\n    read_elem_partial,\n    write_elem,\n)\nfrom anndata.compat import _read_attr\nfrom fsspec.implementations.local import LocalFileSystem\nfrom fsspec.utils import infer_compression\nfrom lamin_utils import logger\nfrom lamindb_setup.core.upath import S3FSMap, infer_filesystem\nfrom packaging import version\nfrom upath import UPath\n\nif TYPE_CHECKING:\n    from collections.abc import Mapping\n\n    from fsspec.core import OpenFile\n    from lamindb_setup.types import AnyPathStr\n\n    from lamindb import Artifact\n\nanndata_version_parse = version.parse(get_version(\"anndata\"))\n\nif anndata_version_parse < version.parse(\"0.9.0\"):\n    from anndata._core.index import Index\nelse:\n    from anndata.compat import Index\n\nif anndata_version_parse < version.parse(\"0.10.0\"):\n    if anndata_version_parse < version.parse(\"0.9.1\"):\n        logger.warning(\n            \"Full backed capabilities are not available for this version of anndata,\"\n            \" please install anndata>=0.9.1.\"\n        )\n\n    from anndata._core.sparse_dataset import SparseDataset\n\n    # try csr for groups with no encoding_type\n    class CSRDataset(SparseDataset):\n        @property\n        def format_str(self) -> str:\n            return \"csr\"\n\n    def sparse_dataset(group):\n        return SparseDataset(group)\n\nelse:\n    if anndata_version_parse >= version.parse(\"0.11.0\"):\n        from 
anndata._core.sparse_dataset import (  # type: ignore\n            _CSRDataset as CSRDataset,\n        )\n    else:\n        from anndata._core.sparse_dataset import CSRDataset  # type: ignore\n    from anndata._core.sparse_dataset import (\n        BaseCompressedSparseDataset as SparseDataset,\n    )\n    from anndata._core.sparse_dataset import sparse_dataset  # type: ignore\n\n    def _check_group_format(*args):\n        pass\n\n    CSRDataset._check_group_format = _check_group_format\n\n\n# zarr and CSRDataset have problems with full selection\ndef _subset_sparse(sparse_ds: CSRDataset | SparseDataset, indices):\n    has_arrays = isinstance(indices[0], np.ndarray) or isinstance(\n        indices[1], np.ndarray\n    )\n    if not has_arrays and indices == (slice(None), slice(None)):\n        return sparse_ds.to_memory()\n    else:\n        return sparse_ds[indices]\n\n\ndef get_module_name(obj):\n    return inspect.getmodule(obj).__name__.partition(\".\")[0]\n\n\ndef _records_to_df(obj):\n    if isinstance(obj, pd.DataFrame):\n        return obj\n\n    if hasattr(obj, \"dtype\") and obj.dtype.names is not None:\n        formats = []\n        for name, (dt, _) in obj.dtype.fields.items():\n            if dt.char == \"S\":\n                new_dt = str(dt).replace(\"S\", \"U\")\n            else:\n                new_dt = dt\n            formats.append((name, new_dt))\n        df = pd.DataFrame(obj.astype(formats, copy=False))\n        for index_name in (\"index\", \"_index\"):\n            if index_name in df.columns:\n                return df.set_index(index_name)\n            return df\n    else:\n        return obj\n\n\nclass AccessRegistry:\n    def __init__(self):\n        self._registry = {}\n        self._openers = {}\n\n    def register_open(self, module: str):\n        def wrapper(func: Callable):\n            self._openers[module] = func\n            return func\n\n        return wrapper\n\n    def open(self, module: str, *args, **kwargs):\n        if 
module in self._openers:\n            return self._openers[module](*args, **kwargs)\n        else:\n            raise ValueError(f\"Module {module} not found, please install it.\")\n\n    def register(self, module: str):\n        def wrapper(func: Callable):\n            func_name = func.__name__\n            if func_name not in self._registry:\n                self._registry[func_name] = {}\n            self._registry[func_name][module] = func\n            return func\n\n        return wrapper\n\n    def __getattr__(self, func_name: str):\n        def wrapper(*args, **kwargs):\n            func_registry = self._registry[func_name]\n            for arg in chain(args, kwargs.values()):\n                arg_module = get_module_name(arg)\n                if arg_module in func_registry:\n                    return func_registry[arg_module](*args, **kwargs)\n            raise ValueError(f\"{func_name} is not registered for this module.\")\n\n        return wrapper\n\n\n# storage specific functions should be registered and called through the registry\nregistry = AccessRegistry()\n\n\n@registry.register_open(\"h5py\")\ndef open(filepath: AnyPathStr, mode: str = \"r\", compression: str | None = \"infer\"):\n    fs, file_path_str = infer_filesystem(filepath)\n    # we don't open compressed files directly because we need fsspec to uncompress on .open\n    compression = (\n        infer_compression(file_path_str) if compression == \"infer\" else compression\n    )\n    if isinstance(fs, LocalFileSystem) and compression is None:\n        assert mode in {\"r\", \"r+\", \"a\", \"w\", \"w-\"}, f\"Unknown mode {mode}!\"  #  noqa: S101\n        return None, h5py.File(file_path_str, mode=mode)\n    if mode == \"r\":\n        conn_mode = \"rb\"\n    elif mode == \"w\":\n        conn_mode = \"wb\"\n    elif mode == \"a\":\n        conn_mode = \"ab\"\n    else:\n        raise ValueError(f\"Unknown mode {mode}! 
Should be 'r', 'w' or 'a'.\")\n    conn = fs.open(file_path_str, mode=conn_mode, compression=compression)\n    try:\n        storage = h5py.File(conn, mode=mode)\n    except Exception as e:\n        conn.close()\n        raise e\n    return conn, storage\n\n\n@registry.register(\"h5py\")\ndef read_dataframe(elem: h5py.Dataset | h5py.Group):\n    if isinstance(elem, h5py.Dataset):\n        return read_dataframe_legacy_h5(elem)\n    else:\n        return read_elem(elem)\n\n\n@registry.register(\"h5py\")\ndef safer_read_partial(elem, indices):\n    is_dataset = isinstance(elem, h5py.Dataset)\n    indices_inverse: list | None = None\n    encoding_type = get_spec(elem).encoding_type\n    # h5py selection for datasets requires sorted indices\n    if is_dataset or encoding_type == \"dataframe\":\n        indices_increasing = []\n        indices_inverse = []\n        for indices_dim in indices:\n            # should be integer or bool\n            # ignore bool or increasing unique integers\n            if (\n                isinstance(indices_dim, np.ndarray)\n                and indices_dim.dtype != \"bool\"\n                and not np.all(np.diff(indices_dim) > 0)\n            ):\n                idx_unique, idx_inverse = np.unique(indices_dim, return_inverse=True)\n                indices_increasing.append(idx_unique)\n                indices_inverse.append(idx_inverse)\n            else:\n                indices_increasing.append(indices_dim)\n                indices_inverse.append(None)\n        indices = tuple(indices_increasing)\n        if all(idx is None for idx in indices_inverse):\n            indices_inverse = None\n    result = None\n    if encoding_type == \"\":\n        if is_dataset:\n            dims = len(elem.shape)\n            if dims == 2:\n                result = elem[indices]\n            elif dims == 1:\n                if indices[0] == slice(None):\n                    result = elem[indices[1]]\n                elif indices[1] == slice(None):\n  
                  result = elem[indices[0]]\n        elif isinstance(elem, h5py.Group):\n            try:\n                ds = CSRDataset(elem)\n                result = _subset_sparse(ds, indices)\n            except Exception as e:\n                logger.debug(\n                    f\"Encountered an exception while attempting to subset a sparse dataset by indices.\\n{e}\"\n                )\n        if result is None:\n            raise ValueError(\n                \"Can not get a subset of the element of type\"\n                f\" {type(elem).__name__} with an empty spec.\"\n            )\n    else:\n        result = read_elem_partial(elem, indices=indices)\n    if indices_inverse is None:\n        return result\n    else:\n        if indices_inverse[0] is None:\n            if len(result.shape) == 2:\n                return result[:, indices_inverse[1]]\n            else:\n                return result[indices_inverse[1]]\n        elif indices_inverse[1] is None:\n            if isinstance(result, pd.DataFrame):\n                return result.iloc[indices_inverse[0]]\n            else:\n                return result[indices_inverse[0]]\n        else:\n            return result[tuple(indices_inverse)]\n\n\n@registry.register(\"h5py\")\ndef keys(storage: h5py.File):\n    attrs_keys: dict[str, list] = {}\n    for attr in storage.keys():\n        if attr == \"X\":\n            continue\n        attr_obj = storage[attr]\n        if attr in (\"obs\", \"var\") and isinstance(attr_obj, h5py.Dataset):\n            keys = list(attr_obj.dtype.fields.keys())\n        else:\n            keys = list(attr_obj.keys())\n        if len(keys) > 0:\n            attrs_keys[attr] = keys\n    return attrs_keys\n\n\nArrayTypes = [h5py.Dataset]\nGroupTypes = [h5py.Group]\nStorageTypes = [h5py.File]\n\n\nZARR_INSTALLED = False\ntry:\n    import zarr\n\n    ZARR_INSTALLED = True\nexcept ImportError:\n    pass\n\nif ZARR_INSTALLED:\n    from anndata._io.zarr import 
read_dataframe_legacy as read_dataframe_legacy_zarr\n\n    from ._zarr import IS_ZARR_V3, get_zarr_store\n\n    ArrayTypes.append(zarr.Array)\n    GroupTypes.append(zarr.Group)\n    StorageTypes.append(zarr.Group)\n\n    @registry.register_open(\"zarr\")\n    def open(filepath: AnyPathStr, mode: Literal[\"r\", \"r+\", \"a\", \"w\", \"w-\"] = \"r\"):\n        assert mode in {\"r\", \"r+\", \"a\", \"w\", \"w-\"}, f\"Unknown mode {mode}!\"  #  noqa: S101\n\n        store = get_zarr_store(filepath)\n        kwargs = {}\n        if IS_ZARR_V3 and mode != \"r\":\n            # otherwise unable to write\n            kwargs[\"use_consolidated\"] = False\n        storage = zarr.open(store, mode=mode, **kwargs)\n        # zarr v2 re-initializes the mapper\n        # we need to put back the correct one\n        # S3FSMap is returned from get_zarr_store only for zarr v2\n        if isinstance(store, S3FSMap):\n            assert not IS_ZARR_V3  # noqa: S101\n\n            storage.store.map = store\n        conn = None\n        return conn, storage\n\n    @registry.register(\"zarr\")\n    def read_dataframe(elem: Union[zarr.Array, zarr.Group]):  # noqa\n        if isinstance(elem, zarr.Array):\n            return read_dataframe_legacy_zarr(elem)\n        else:\n            return read_elem(elem)\n\n    @registry.register(\"zarr\")\n    def safer_read_partial(elem, indices):\n        encoding_type = get_spec(elem).encoding_type\n        if encoding_type == \"\":\n            if isinstance(elem, zarr.Array):\n                dims = len(elem.shape)\n                if dims == 2:\n                    return elem.oindex[indices]\n                elif dims == 1:\n                    if indices[0] == slice(None):\n                        return elem.oindex[indices[1]]\n                    elif indices[1] == slice(None):\n                        return elem.oindex[indices[0]]\n            elif isinstance(elem, zarr.Group):\n                try:\n                    ds = 
CSRDataset(elem)\n                    return _subset_sparse(ds, indices)\n                except Exception as e:\n                    logger.debug(\n                        f\"Encountered an exception while attempting to subset a sparse dataset by indices.\\n{e}\"\n                    )\n            raise ValueError(\n                \"Can not get a subset of the element of type\"\n                f\" {type(elem).__name__} with an empty spec.\"\n            )\n        else:\n            if encoding_type in (\"csr_matrix\", \"csc_matrix\"):\n                ds = sparse_dataset(elem)\n                return _subset_sparse(ds, indices)\n            else:\n                indices = tuple(\n                    idim.tolist()\n                    if isinstance(idim, np.ndarray) and idim.dtype == \"bool\"\n                    else idim\n                    for idim in indices\n                )\n                return read_elem_partial(elem, indices=indices)\n\n    # this is needed because accessing zarr.Group.keys() directly is very slow\n    @registry.register(\"zarr\")\n    def keys(storage: zarr.Group):\n        if IS_ZARR_V3:\n            paths = storage._sync_iter(storage.store.list())\n        else:\n            paths = storage.store.keys()\n\n        attrs_keys: dict[str, list] = {}\n        obs_var_arrays = []\n\n        prefix = storage.path\n        if prefix == \"\":\n            paths_iter = (path for path in paths)\n        else:\n            prefix += \"/\"\n            paths_iter = (\n                path.removeprefix(prefix) for path in paths if path.startswith(prefix)\n            )\n\n        for path in paths_iter:\n            if path in (\".zattrs\", \".zgroup\"):\n                continue\n            parts = path.split(\"/\")\n            if len(parts) < 2:\n                continue\n            attr = parts[0]\n            key = parts[1]\n\n            if attr == \"X\":\n                continue\n\n            if attr in (\"obs\", \"var\"):\n       
         if attr in obs_var_arrays:\n                    continue\n                if key == \".zarray\":\n                    attrs_keys.pop(attr, None)\n                    obs_var_arrays.append(attr)\n\n            if attr not in attrs_keys:\n                attrs_keys[attr] = []\n\n            if key in (\".zattrs\", \".zgroup\", \".zarray\"):\n                continue\n            attr_keys = attrs_keys[attr]\n            if key not in attr_keys:\n                attr_keys.append(key)\n\n        for attr in obs_var_arrays:\n            attrs_keys[attr] = list(storage[attr].dtype.fields.keys())\n\n        return {attr: keys for attr, keys in attrs_keys.items() if len(keys) > 0}\n\n\nArrayTypes = tuple(ArrayTypes)  # type: ignore\nGroupTypes = tuple(GroupTypes)  # type: ignore\nStorageTypes = tuple(StorageTypes)  # type: ignore\n\n\nArrayType = Union[ArrayTypes]  # type: ignore\nGroupType = Union[GroupTypes]  # type: ignore\nStorageType = Union[StorageTypes]  # type: ignore\n\n\ndef _to_memory(elem):\n    if isinstance(elem, ArrayTypes):\n        return elem[()]\n    elif isinstance(elem, SparseDataset):\n        return elem.to_memory()\n    else:\n        return elem\n\n\ndef _try_backed_full(elem):\n    # think what to do for compatibility with old var and obs\n    if isinstance(elem, ArrayTypes):\n        return elem\n\n    if isinstance(elem, GroupTypes):\n        encoding_type = get_spec(elem).encoding_type\n        if encoding_type in (\"csr_matrix\", \"csc_matrix\"):\n            return sparse_dataset(elem)\n        if \"h5sparse_format\" in elem.attrs:\n            return sparse_dataset(elem)\n        if encoding_type == \"\" and \"indptr\" in elem:\n            return CSRDataset(elem)\n\n    return read_elem(elem)\n\n\ndef _to_index(elem: np.ndarray):\n    if elem.dtype in (np.float64, np.int64):\n        elem = elem.astype(str)\n    return pd.Index(elem)\n\n\ndef _safer_read_index(elem):\n    if isinstance(elem, GroupTypes):\n        return 
_to_index(read_elem(elem[_read_attr(elem.attrs, \"_index\")]))\n    elif isinstance(elem, ArrayTypes):\n        indices = None\n        for index_name in (\"index\", \"_index\"):\n            if index_name in elem.dtype.names:\n                indices = elem[index_name]\n                break\n        if indices is not None and len(indices) > 0:\n            if isinstance(indices[0], bytes):\n                indices = np.frompyfunc(lambda x: x.decode(\"utf-8\"), 1, 1)(indices)\n            return _to_index(indices)\n        else:\n            raise ValueError(\"Indices not found.\")\n    else:\n        raise ValueError(f\"Unknown elem type {type(elem)} when reading indices.\")\n\n\nclass _MapAccessor:\n    def __init__(self, elem, name, indices=None):\n        self.elem = elem\n        self.indices = indices\n        self.name = name\n\n    def __getitem__(self, key):\n        if self.indices is None:\n            return _try_backed_full(self.elem[key])\n        else:\n            return registry.safer_read_partial(self.elem[key], indices=self.indices)\n\n    def keys(self):\n        return list(self.elem.keys())\n\n    def __repr__(self):\n        \"\"\"Description of the _MapAccessor object.\"\"\"\n        descr = f\"Accessor for the AnnData attribute {self.name}\"\n        descr += f\"\\n  with keys: {self.keys()}\"\n        return descr\n\n\ndef _safer_read_df(elem, indices=None):\n    if indices is not None:\n        obj = registry.safer_read_partial(elem, indices=indices)\n        df = _records_to_df(obj)\n    else:\n        df = registry.read_dataframe(elem)\n    if df.index.dtype in (np.float64, np.int64):\n        df.index = df.index.astype(str)\n    return df\n\n\nclass _AnnDataAttrsMixin:\n    storage: StorageType\n    _attrs_keys: Mapping[str, list]\n\n    @cached_property\n    def obs(self) -> pd.DataFrame | None:\n        if \"obs\" not in self._attrs_keys:\n            return None\n        indices = getattr(self, \"indices\", None)\n        return 
_safer_read_df(\n            self.storage[\"obs\"],  # type: ignore\n            indices=(indices[0], slice(None)) if indices is not None else None,\n        )\n\n    @cached_property\n    def var(self) -> pd.DataFrame | None:\n        if \"var\" not in self._attrs_keys:\n            return None\n        indices = getattr(self, \"indices\", None)\n        return _safer_read_df(\n            self.storage[\"var\"],  # type: ignore\n            indices=(indices[1], slice(None)) if indices is not None else None,\n        )\n\n    @cached_property\n    def uns(self):\n        if \"uns\" not in self._attrs_keys:\n            return None\n        return read_elem(self.storage[\"uns\"])\n\n    @cached_property\n    def X(self):\n        indices = getattr(self, \"indices\", None)\n        if indices is not None:\n            return registry.safer_read_partial(self.storage[\"X\"], indices=indices)\n        else:\n            return _try_backed_full(self.storage[\"X\"])\n\n    @cached_property\n    def obsm(self):\n        if \"obsm\" not in self._attrs_keys:\n            return None\n        indices = getattr(self, \"indices\", None)\n        if indices is not None:\n            indices = (indices[0], slice(None))\n        return _MapAccessor(self.storage[\"obsm\"], \"obsm\", indices)\n\n    @cached_property\n    def varm(self):\n        if \"varm\" not in self._attrs_keys:\n            return None\n        indices = getattr(self, \"indices\", None)\n        if indices is not None:\n            indices = (indices[1], slice(None))\n        return _MapAccessor(self.storage[\"varm\"], \"varm\", indices)\n\n    @cached_property\n    def obsp(self):\n        if \"obsp\" not in self._attrs_keys:\n            return None\n        indices = getattr(self, \"indices\", None)\n        if indices is not None:\n            indices = (indices[0], indices[0])\n        return _MapAccessor(self.storage[\"obsp\"], \"obsp\", indices)\n\n    @cached_property\n    def varp(self):\n        if 
\"varp\" not in self._attrs_keys:\n            return None\n        indices = getattr(self, \"indices\", None)\n        if indices is not None:\n            indices = (indices[1], indices[1])\n        return _MapAccessor(self.storage[\"varp\"], \"varp\", indices)\n\n    @cached_property\n    def layers(self):\n        if \"layers\" not in self._attrs_keys:\n            return None\n        indices = getattr(self, \"indices\", None)\n        return _MapAccessor(self.storage[\"layers\"], \"layers\", indices)\n\n    @property\n    def obs_names(self):\n        return self._obs_names\n\n    @property\n    def var_names(self):\n        return self._var_names\n\n    @cached_property\n    def shape(self):\n        return len(self._obs_names), len(self._var_names)\n\n    def to_dict(self):\n        prepare_adata = {}\n\n        prepare_adata[\"X\"] = _to_memory(self.X)\n\n        if \"uns\" in self._attrs_keys:\n            prepare_adata[\"uns\"] = self.uns\n\n        for attr in (\"obs\", \"var\"):\n            if attr in self._attrs_keys:\n                prepare_adata[attr] = getattr(self, attr)\n\n        for attr in (\"obsm\", \"varm\", \"obsp\", \"varp\", \"layers\"):\n            if attr in self._attrs_keys:\n                prepare_adata[attr] = {}\n                get_attr = getattr(self, attr)\n                for key in self._attrs_keys[attr]:\n                    prepare_adata[attr][key] = _to_memory(get_attr[key])\n\n        if \"raw\" in self._attrs_keys:\n            prepare_adata[\"raw\"] = self.raw.to_dict()\n\n        return prepare_adata\n\n    def to_memory(self):\n        adata = AnnData(**self.to_dict())\n        return adata\n\n\nclass AnnDataAccessorSubset(_AnnDataAttrsMixin):\n    def __init__(self, storage, indices, attrs_keys, obs_names, var_names, ref_shape):\n        self.storage = storage\n        self.indices = indices\n\n        self._attrs_keys = attrs_keys\n        self._obs_names, self._var_names = obs_names, var_names\n\n        
self._ref_shape = ref_shape\n\n    def __getitem__(self, index: Index):\n        \"\"\"Access a subset of the underlying AnnData object.\"\"\"\n        oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)\n        new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]\n        if self.indices is not None:\n            oidx = _resolve_idx(self.indices[0], oidx, self._ref_shape[0])\n            vidx = _resolve_idx(self.indices[1], vidx, self._ref_shape[1])\n        return type(self)(\n            self.storage,\n            (oidx, vidx),\n            self._attrs_keys,\n            new_obs_names,\n            new_var_names,\n            self._ref_shape,\n        )\n\n    def __repr__(self):\n        \"\"\"Description of the object.\"\"\"\n        n_obs, n_vars = self.shape\n        descr = f\"{type(self).__name__} object with n_obs × n_vars = {n_obs} × {n_vars}\"\n        for attr, keys in self._attrs_keys.items():\n            descr += f\"\\n  {attr}: {keys}\"\n        return descr\n\n    @cached_property\n    def raw(self):\n        if \"raw\" not in self._attrs_keys:\n            return None\n        prepare_indices = None\n        if self.indices is not None:\n            oidx = self.indices[0]\n            if isinstance(oidx, np.ndarray) or oidx != slice(None):\n                prepare_indices = oidx, slice(None)\n        return AnnDataRawAccessor(\n            self.storage[\"raw\"],\n            prepare_indices,\n            None,\n            self._obs_names,\n            None,\n            self._ref_shape[0],\n        )\n\n\nclass AnnDataRawAccessor(AnnDataAccessorSubset):\n    def __init__(\n        self, storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape\n    ):\n        var_raw = storage_raw[\"var\"]\n\n        if var_names is None:\n            var_names = _safer_read_index(var_raw)\n\n        if isinstance(ref_shape, int):\n            ref_shape = ref_shape, len(var_names)\n        elif 
isinstance(ref_shape, tuple) and len(ref_shape) < 2:\n            ref_shape = ref_shape[0], len(var_names)\n\n        if attrs_keys is None:\n            attrs_keys = {}\n            if isinstance(var_raw, ArrayTypes):\n                attrs_keys[\"var\"] = list(var_raw.dtype.fields.keys())\n            else:\n                # for some reason list(var_raw.keys()) is very slow for zarr\n                # maybe also directly get keys from the underlying mapper\n                attrs_keys[\"var\"] = list(var_raw)\n            if \"varm\" in storage_raw:\n                varm_keys_raw = list(storage_raw[\"varm\"])\n                if len(varm_keys_raw) > 0:\n                    attrs_keys[\"varm\"] = varm_keys_raw\n\n        super().__init__(\n            storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape\n        )\n\n    @property\n    def raw(self):\n        raise AttributeError\n\n\nclass AnnDataAccessor(_AnnDataAttrsMixin):\n    \"\"\"Cloud-backed AnnData.\"\"\"\n\n    def __init__(\n        self,\n        connection: OpenFile | None,\n        storage: StorageType,\n        filename: str,\n        artifact: Artifact | None = None,\n    ):\n        self._conn = connection\n        self.storage = storage\n\n        self._attrs_keys = registry.keys(self.storage)\n\n        self._name = filename\n\n        self._obs_names = _safer_read_index(self.storage[\"obs\"])  # type: ignore\n        self._var_names = _safer_read_index(self.storage[\"var\"])  # type: ignore\n\n        self._artifact = artifact  # save artifact to update in write mode\n\n        self._updated = False  # track updates in r+ mode for zarr\n\n        self._entered = False  # check that the context manager is used\n        self._closed = False\n\n    def close(self):\n        \"\"\"Closes the connection.\"\"\"\n        storage = self.storage\n        connection = self._conn\n\n        if self._updated and (artifact := self._artifact) is not None:\n            from 
lamindb.models.artifact import Artifact\n            from lamindb.models.sqlrecord import init_self_from_db\n\n            # now self._updated can only be True for zarr\n            assert ZARR_INSTALLED  # noqa: S101\n\n            store = storage.store\n            keys = storage._sync_iter(store.list()) if IS_ZARR_V3 else store.keys()\n            # this checks that there consolidated metadata was written before\n            # need to update it\n            # zmetadata is in spatialdata sometimes for some reason\n            if \".zmetadata\" in keys or \"zmetadata\" in keys:\n                zarr.consolidate_metadata(store)\n\n            new_version = Artifact(\n                artifact.path, revises=artifact, _is_internal_call=True\n            ).save()\n            # note: sets _state.db = \"default\"\n            init_self_from_db(artifact, new_version)\n\n        if hasattr(storage, \"close\"):\n            storage.close()\n        if hasattr(connection, \"close\"):\n            connection.close()\n        self._closed = True\n\n    @property\n    def closed(self):\n        return self._closed\n\n    def __enter__(self):\n        self._entered = True\n\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        self.close()\n\n    def __getitem__(self, index: Index) -> AnnDataAccessorSubset:\n        \"\"\"Access a subset of the underlying AnnData object.\"\"\"\n        oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)\n        new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]\n        return AnnDataAccessorSubset(\n            self.storage,\n            (oidx, vidx),\n            self._attrs_keys,\n            new_obs_names,\n            new_var_names,\n            self.shape,\n        )\n\n    def __repr__(self):\n        \"\"\"Description of the AnnDataAccessor object.\"\"\"\n        n_obs, n_vars = self.shape\n        descr = f\"AnnDataAccessor object with n_obs × n_vars = {n_obs} 
× {n_vars}\"\n        descr += f\"\\n  constructed for the AnnData object {self._name}\"\n        for attr, keys in self._attrs_keys.items():\n            descr += f\"\\n    {attr}: {keys}\"\n        return descr\n\n    @cached_property\n    def raw(self):\n        if \"raw\" not in self._attrs_keys:\n            return None\n        return AnnDataRawAccessor(\n            self.storage[\"raw\"], None, None, self._obs_names, None, self.shape[0]\n        )\n\n    def add_column(\n        self,\n        where: Literal[\"obs\", \"var\"],\n        col_name: str,\n        col: np.ndarray | pd.Categorical,\n    ):\n        \"\"\"Add a new column to .obs or .var of the underlying AnnData object.\"\"\"\n        df_store = self.storage[where]  # type: ignore\n        if getattr(df_store, \"read_only\", True):\n            raise ValueError(\n                \"You can use .add_column(...) only with zarr in a writable mode.\"\n            )\n        write_elem(df_store, col_name, col)\n        df_store.attrs[\"column-order\"] = df_store.attrs[\"column-order\"] + [col_name]\n        # remind only once if this wasn't updated before and not in the context manager\n        if not self._updated and not self._entered and self._artifact is not None:\n            logger.important(\n                \"Do not forget to call .close() after you finish \"\n                f\"working with this accessor for {self._name} \"\n                \"to automatically update the corresponding artifact.\"\n            )\n\n        self._updated = True\n        # reset the cached property\n        # todo: maybe just append the column if the df was already loaded\n        self.__dict__.pop(where, None)\n        # update the cached columns\n        self._attrs_keys[where].append(col_name)\n\n\n# get the number of observations in an anndata object or file fast and safely\ndef _anndata_n_observations(object: AnyPathStr | AnnData) -> int | None:\n    if isinstance(object, AnnData):\n        return 
object.n_obs\n\n    try:\n        objectpath = UPath(object)\n        conn_module = None\n        if \".h5ad\" in objectpath.suffixes:\n            conn_module = \"h5py\"\n        elif objectpath.suffix == \".zarr\":\n            conn_module = \"zarr\"\n        conn, storage = registry.open(conn_module, objectpath, mode=\"r\")\n    except Exception as e:\n        logger.warning(f\"Could not open {object} to read n_observations: {e}\")\n        return None\n\n    n_observations: int | None = None\n    try:\n        obs = storage[\"obs\"]\n        if isinstance(obs, GroupTypes):  # type: ignore\n            if \"_index\" in obs.attrs:\n                elem_key = _read_attr(obs.attrs, \"_index\")\n            else:\n                elem_key = next(iter(obs))\n            elem = obs[elem_key]\n            if isinstance(elem, ArrayTypes):  # type: ignore\n                n_observations = elem.shape[0]\n            else:\n                # assume standard obs group\n                n_observations = elem[\"codes\"].shape[0]\n        else:\n            n_observations = obs.shape[0]\n    except Exception as e:\n        logger.warning(f\"Could not read n_observations from anndata {object}: {e}\")\n    finally:\n        if hasattr(storage, \"close\"):\n            storage.close()\n        if hasattr(conn, \"close\"):\n            conn.close()\n    return n_observations\n"
  },
  {
    "path": "lamindb/core/storage/_backed_access.py",
    "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Any, Callable, Literal\n\nPYARROW_SUFFIXES = (\".parquet\", \".csv\", \".json\", \".orc\", \".arrow\", \".feather\", \".ipc\")\nPOLARS_SUFFIXES = (\".parquet\", \".csv\", \".ndjson\", \".ipc\")\n\nif TYPE_CHECKING:\n    from collections.abc import Iterator\n\n    from fsspec.core import OpenFile\n    from polars import LazyFrame as PolarsLazyFrame\n    from pyarrow.dataset import Dataset as PyArrowDataset\n    from tiledbsoma import Collection as SOMACollection\n    from tiledbsoma import Experiment as SOMAExperiment\n    from tiledbsoma import Measurement as SOMAMeasurement\n    from upath import UPath\n\n    from lamindb.models.artifact import Artifact\n\n    from ._anndata_accessor import AnnDataAccessor, StorageType\n    from ._spatialdata_accessor import SpatialDataAccessor\n\n\n# this dynamically creates a subclass of a context manager class\n# and reassigns it to an instance of the superclass\n# so that the instance calls finalize on close or exit\ndef _track_writes_factory(obj: Any, finalize: Callable):\n    closed: bool = False\n\n    tracked_class = obj.__class__\n    type_dict = {\"__doc__\": tracked_class.__doc__}\n    if hasattr(tracked_class, \"__slots__\"):\n        type_dict[\"__slots__\"] = ()\n    if hasattr(tracked_class, \"__exit__\"):\n\n        def __exit__(self, exc_type, exc_val, exc_tb):\n            nonlocal closed\n            tracked_class.__exit__(self, exc_type, exc_val, exc_tb)\n            if not closed:\n                finalize()\n                closed = True\n\n        type_dict[\"__exit__\"] = __exit__\n    if hasattr(tracked_class, \"close\"):\n\n        def close(self, *args, **kwargs):\n            nonlocal closed\n            tracked_class.close(self, *args, **kwargs)\n            if not closed:\n                finalize()\n                closed = True\n\n        type_dict[\"close\"] = close\n\n    
Track = type(tracked_class.__name__ + \"Track\", (tracked_class,), type_dict)\n    obj.__class__ = Track\n    return obj\n\n\n@dataclass\nclass BackedAccessor:\n    \"\"\"h5py.File or zarr.Group accessor.\"\"\"\n\n    connection: OpenFile\n    \"\"\"The connection.\"\"\"\n    storage: StorageType\n    \"\"\"The storage access.\"\"\"\n\n\ndef backed_access(\n    artifact_or_filepath: Artifact | UPath,\n    mode: str = \"r\",\n    engine: Literal[\"pyarrow\", \"polars\"] = \"pyarrow\",\n    using_key: str | None = None,\n    **kwargs,\n) -> (\n    AnnDataAccessor\n    | SpatialDataAccessor\n    | BackedAccessor\n    | SOMACollection\n    | SOMAExperiment\n    | SOMAMeasurement\n    | PyArrowDataset\n    | Iterator[PolarsLazyFrame]\n):\n    from lamindb.models import Artifact\n\n    from .paths import filepath_from_artifact\n\n    if isinstance(artifact_or_filepath, Artifact):\n        artifact = artifact_or_filepath\n        objectpath, _ = filepath_from_artifact(artifact, using_key=using_key)\n    else:\n        artifact = None\n        objectpath = artifact_or_filepath\n    name = objectpath.name\n    suffix = objectpath.suffix\n    non_gz_suffix = _non_gz_suffix(objectpath.suffixes)\n\n    if name == \"soma\" or suffix == \".tiledbsoma\":\n        if mode not in {\"r\", \"w\"}:\n            raise ValueError(\"`mode` should be either 'r' or 'w' for tiledbsoma.\")\n        from ._tiledbsoma import _open_tiledbsoma\n\n        return _open_tiledbsoma(objectpath, mode=mode, **kwargs)  # type: ignore\n    elif non_gz_suffix in {\".h5\", \".hdf5\", \".h5ad\"}:\n        from ._anndata_accessor import registry\n\n        conn, storage = registry.open(\"h5py\", objectpath, mode=mode, **kwargs)\n    elif suffix == \".zarr\":\n        from ._anndata_accessor import registry\n\n        if mode not in {\"r\", \"r+\"}:\n            raise ValueError(\"`mode` should be either 'r' or 'r+' for zarr.\")\n        conn, storage = registry.open(\"zarr\", objectpath, mode=mode, 
**kwargs)\n        if \"spatialdata_attrs\" in storage.attrs:\n            from ._spatialdata_accessor import SpatialDataAccessor\n\n            return SpatialDataAccessor(storage, name, artifact)\n    elif len(df_suffixes := _flat_suffixes(objectpath)) == 1 and (\n        df_suffix := df_suffixes.pop()\n    ) in set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES):\n        return _open_dataframe(objectpath, df_suffix, engine, **kwargs)\n    else:\n        raise ValueError(\n            \"The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix \"\n            f\"be compatible with pyarrow.dataset.dataset or polars.scan_* functions, \"\n            f\"instead of being {suffix} object.\"\n        )\n\n    import h5py\n    from anndata._io.specs.registry import get_spec\n\n    from ._anndata_accessor import AnnDataAccessor\n\n    is_anndata = (\n        non_gz_suffix == \".h5ad\" or get_spec(storage).encoding_type == \"anndata\"\n    )\n    if is_anndata:\n        if mode != \"r\" and isinstance(storage, h5py.Group):\n            raise ValueError(\"Can only access `hdf5` `AnnData` with mode='r'.\")\n        return AnnDataAccessor(conn, storage, name, artifact)\n    else:\n        return BackedAccessor(conn, storage)\n\n\ndef _non_gz_suffix(suffixes: list[str]) -> str:\n    len_suffixes = len(suffixes)\n    if len_suffixes == 0:\n        return \"\"\n    if len_suffixes > 1 and \".gz\" in suffixes:\n        if (suffix := suffixes[-2]) != \".tar\":\n            return suffix\n        elif len_suffixes > 2:\n            return suffixes[-3]\n    return suffixes[-1]\n\n\ndef _flat_suffixes(paths: UPath | list[UPath]) -> set[str]:\n    # it is assumed here that the paths exist\n    # we don't check here that the filesystem is the same\n    # but this is a requirement for pyarrow.dataset.dataset\n    path_list = []\n    paths_list = paths if isinstance(paths, list) else [paths]\n    for path in paths_list:\n        # assume http is always a file\n        if 
path.protocol not in {\"http\", \"https\"} and path.is_dir():\n            path_list += [p for p in path.rglob(\"*\") if p.suffix != \"\"]\n        else:\n            path_list.append(path)\n\n    return {path.suffix for path in path_list}\n\n\ndef _open_dataframe(\n    paths: UPath | list[UPath],\n    suffix: str | None = None,\n    engine: Literal[\"pyarrow\", \"polars\"] = \"pyarrow\",\n    **kwargs,\n) -> PyArrowDataset | Iterator[PolarsLazyFrame]:\n    from ._polars_lazy_df import POLARS_SUFFIXES, _open_polars_lazy_df\n    from ._pyarrow_dataset import PYARROW_SUFFIXES, _open_pyarrow_dataset\n\n    if engine not in {\"pyarrow\", \"polars\"}:\n        raise ValueError(\n            f\"Unknown engine: {engine}. It should be 'pyarrow' or 'polars'.\"\n        )\n\n    df_suffix: str\n    if suffix is None:\n        df_suffixes = _flat_suffixes(paths)\n        if len(df_suffixes) > 1:\n            raise ValueError(\n                f\"The artifacts in the collection have different file formats: {', '.join(df_suffixes)}.\\n\"\n                \"It is not possible to open such stores with pyarrow or polars.\"\n            )\n        df_suffix = df_suffixes.pop()\n    else:\n        df_suffix = suffix\n\n    if engine == \"pyarrow\" and df_suffix not in PYARROW_SUFFIXES:\n        raise ValueError(\n            f\"{df_suffix} files are not supported by pyarrow, \"\n            f\"they should have one of these formats: {', '.join(PYARROW_SUFFIXES)}.\"\n        )\n    elif engine == \"polars\" and df_suffix not in POLARS_SUFFIXES:\n        raise ValueError(\n            f\"{df_suffix} files are not supported by polars, \"\n            f\"they should have one of these formats: {', '.join(POLARS_SUFFIXES)}.\"\n        )\n\n    polars_without_fsspec = engine == \"polars\" and not kwargs.get(\"use_fsspec\", False)\n    paths_list = paths if isinstance(paths, list) else [paths]\n    if (engine == \"pyarrow\" or polars_without_fsspec) and len(paths_list) > 1:\n        # this 
checks that the filesystem is the same for all paths\n        # this is a requirement of pyarrow.dataset.dataset\n        fs = paths_list[0].fs\n        for path in paths_list[1:]:\n            # this assumes that the filesystems are cached by fsspec\n            if path.fs is not fs:\n                engine_msg = (\n                    \"polars engine without passing `use_fsspec=True`\"\n                    if engine == \"polars\"\n                    else \"pyarrow engine\"\n                )\n                raise ValueError(\n                    \"The collection has artifacts with different filesystems, \"\n                    f\"this is not supported for {engine_msg}.\"\n                )\n\n    return (\n        _open_pyarrow_dataset(paths, **kwargs)\n        if engine == \"pyarrow\"\n        else _open_polars_lazy_df(paths, **kwargs)\n    )\n"
  },
  {
    "path": "lamindb/core/storage/_polars_lazy_df.py",
    "content": "from __future__ import annotations\n\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING\n\nfrom lamindb_setup.core.upath import _ensure_sync_with_fs, get_storage_region\n\nif TYPE_CHECKING:\n    from collections.abc import Iterator\n\n    from polars import LazyFrame as PolarsLazyFrame\n    from upath import UPath\n\nPOLARS_SUFFIXES = (\".parquet\", \".csv\", \".ndjson\", \".ipc\")\n\n\ndef _polars_options(storepath: UPath) -> dict:\n    polars_options: dict = {}\n    storage_options: dict[str, str | bool] = {}\n\n    fs = storepath.fs\n    fs.connect()\n\n    endpoint_url = fs.endpoint_url\n    if endpoint_url is not None:\n        storage_options[\"aws_virtual_hosted_style_request\"] = False\n        storage_options[\"aws_endpoint_url\"] = endpoint_url\n        if endpoint_url.startswith(\"http://\"):\n            storage_options[\"aws_allow_http\"] = True\n    else:\n        storage_options[\"aws_region\"] = get_storage_region(storepath)\n\n    if fs.anon:\n        storage_options[\"aws_skip_signature\"] = True\n    else:\n        aws_key = fs.key\n        aws_secret = fs.secret\n        aws_token = fs.token\n        if aws_key is not None and aws_secret is not None:\n            storage_options[\"aws_access_key_id\"] = aws_key\n            storage_options[\"aws_secret_access_key\"] = aws_secret\n            if aws_token is not None:\n                storage_options[\"aws_session_token\"] = aws_token\n        else:\n            from aiobotocore.credentials import AioRefreshableCredentials\n\n            if isinstance(\n                refreshable_credentials := fs.session._credentials,\n                AioRefreshableCredentials,\n            ):\n                refresh_sync = _ensure_sync_with_fs(\n                    refreshable_credentials._refresh, fs\n                )\n\n                def credential_provider_fn():\n                    # refresh and access the credentials\n                    refresh_sync()\n           
         expiry_time = refreshable_credentials._expiry_time\n                    return {\n                        \"aws_access_key_id\": refreshable_credentials._access_key,\n                        \"aws_secret_access_key\": refreshable_credentials._secret_key,\n                        \"aws_session_token\": refreshable_credentials._token,\n                    }, int(expiry_time.timestamp()) if expiry_time is not None else None\n\n                polars_options[\"credential_provider\"] = credential_provider_fn\n\n    polars_options[\"storage_options\"] = storage_options\n\n    return polars_options\n\n\n@contextmanager\ndef _open_polars_lazy_df(\n    paths: UPath | list[UPath], use_fsspec: bool = False, **kwargs\n) -> Iterator[PolarsLazyFrame]:\n    try:\n        import polars as pl\n    except ImportError as ie:\n        raise ImportError(\"Please install polars: pip install polars\") from ie\n\n    scans = {\n        \".parquet\": pl.scan_parquet,\n        \".csv\": pl.scan_csv,\n        \".ndjson\": pl.scan_ndjson,\n        \".ipc\": pl.scan_ipc,\n    }\n\n    path_list = []\n    paths_list = paths if isinstance(paths, list) else [paths]\n    for path in paths_list:\n        # assume http is always a file\n        if path.protocol not in {\"http\", \"https\"} and path.is_dir():\n            path_list += [p for p in path.rglob(\"*\") if p.suffix != \"\"]\n        else:\n            path_list.append(path)\n    # assume the filesystem is the same for all\n    # it is checked in _open_dataframe\n    path0 = path_list[0]\n    if (\n        not use_fsspec\n        and path0.protocol == \"s3\"\n        and \"storage_options\" not in kwargs\n        and \"credential_provider\" not in kwargs\n    ):\n        kwargs.update(_polars_options(path0))\n\n    open_files = []\n\n    try:\n        for path in path_list:\n            open_files.append(path.open(mode=\"rb\") if use_fsspec else path.as_posix())\n\n        yield scans[path_list[0].suffix](open_files, **kwargs)\n    
finally:\n        if use_fsspec:\n            for open_file in open_files:\n                open_file.close()\n"
  },
  {
    "path": "lamindb/core/storage/_pyarrow_dataset.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nimport pyarrow.dataset\nfrom lamindb_setup.core.upath import LocalPathClasses\n\nif TYPE_CHECKING:\n    from pyarrow.dataset import Dataset as PyArrowDataset\n    from upath import UPath\n\n\nPYARROW_SUFFIXES = (\".parquet\", \".csv\", \".json\", \".orc\", \".arrow\", \".feather\", \".ipc\")\n\n\ndef _open_pyarrow_dataset(paths: UPath | list[UPath], **kwargs) -> PyArrowDataset:\n    if isinstance(paths, list):\n        # a single path can be a directory, but a list of paths\n        # has to be a flat list of files\n        paths_str = []\n        path0 = paths[0]\n        if isinstance(path0, LocalPathClasses):\n            path_to_str = lambda p: p.as_posix()\n            filesystem = None\n        else:\n            path_to_str = lambda p: p.path\n            filesystem = path0.fs\n        for path in paths:\n            if (\n                getattr(path, \"protocol\", None) not in {\"http\", \"https\"}\n                and path.is_dir()\n            ):\n                paths_str += [path_to_str(p) for p in path.rglob(\"*\") if p.suffix != \"\"]\n            else:\n                paths_str.append(path_to_str(path))\n    elif isinstance(paths, LocalPathClasses):\n        paths_str, filesystem = paths.as_posix(), None\n    else:\n        paths_str, filesystem = paths.path, paths.fs\n\n    return pyarrow.dataset.dataset(paths_str, filesystem=filesystem, **kwargs)\n"
  },
  {
    "path": "lamindb/core/storage/_spatialdata_accessor.py",
    "content": "from __future__ import annotations\n\nfrom functools import cached_property\nfrom typing import TYPE_CHECKING\n\nfrom ._anndata_accessor import AnnDataAccessor\n\nif TYPE_CHECKING:\n    from zarr import Group\n\n    from lamindb import Artifact\n\n\nclass _TablesAccessor:\n    def __init__(self, tables: Group, artifact: Artifact | None = None):\n        self._tables = tables\n\n        self._artifact = artifact\n\n    def __getitem__(self, key: str) -> AnnDataAccessor:\n        return AnnDataAccessor(\n            connection=None,\n            storage=self._tables[key],\n            filename=key,\n            artifact=self._artifact,\n        )\n\n    def keys(self) -> list[str]:\n        return list(self._tables.keys())\n\n    def __repr__(self) -> str:\n        \"\"\"Description of the _TablesAccessor object.\"\"\"\n        descr = (\n            f\"Accessor for the SpatialData attribute tables\\n  with keys: {self.keys()}\"\n        )\n        return descr\n\n\nclass SpatialDataAccessor:\n    \"\"\"Cloud-backed SpatialData.\n\n    For now only allows to access `tables`.\n    \"\"\"\n\n    def __init__(self, storage: Group, name: str, artifact: Artifact | None = None):\n        self.storage = storage\n        self._name = name\n\n        self._artifact = artifact\n\n    @cached_property\n    def tables(self) -> _TablesAccessor:\n        \"\"\"tables of the underlying SpatialData object.\"\"\"\n        return _TablesAccessor(self.storage[\"tables\"], self._artifact)\n\n    def __repr__(self):\n        \"\"\"Description of the SpatialDataAccessor object.\"\"\"\n        descr = (\n            \"SpatialDataAccessor object\"\n            f\"\\n  constructed for the SpatialData object {self._name}\"\n            f\"\\n    with tables: {self.tables.keys()}\"\n        )\n        return descr\n"
  },
  {
    "path": "lamindb/core/storage/_tiledbsoma.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Literal\nfrom urllib.parse import urlparse\n\nimport pandas as pd\nimport pyarrow as pa\nfrom anndata import AnnData, read_h5ad\nfrom lamin_utils import logger\nfrom lamindb_setup import settings as setup_settings\nfrom lamindb_setup.core.upath import (\n    LocalPathClasses,\n    _ensure_sync_with_fs,\n    create_path,\n    get_storage_region,\n)\nfrom packaging import version\n\nif TYPE_CHECKING:\n    from lamindb_setup.types import AnyPathStr\n    from tiledbsoma import Collection as SOMACollection\n    from tiledbsoma import Experiment as SOMAExperiment\n    from tiledbsoma import Measurement as SOMAMeasurement\n    from tiledbsoma import SOMATileDBContext\n    from upath import UPath\n\n    from lamindb.models.artifact import Artifact\n    from lamindb.models.run import Run\n\n\ndef _load_h5ad_zarr(objpath: UPath):\n    from lamindb.core.loaders import load_h5ad, load_zarr\n\n    if objpath.is_dir():\n        adata = load_zarr(objpath, expected_type=\"anndata\")\n    else:\n        # read only local in backed for now\n        # in principle possible to read remote in backed also\n        if isinstance(objpath, LocalPathClasses):\n            adata = read_h5ad(objpath.as_posix(), backed=\"r\")\n        else:\n            adata = load_h5ad(objpath)\n    return adata\n\n\nclass SOMAS3ContextFactory:\n    \"\"\"Prepares and caches soma.SOMATileDBContext for a given storepath.\n\n    For S3 storage with federated credentials, credentials are\n    read and refreshed only when the store is opened—i.e. when\n    :meth:`get_context` is called as part of opening the TileDB-SOMA store.\n    They are not updated while a store handle is held open. 
If credentials\n    expire during a long-lived session, close the store and open it again to\n    refresh.\n    \"\"\"\n\n    def __init__(self, storepath: UPath):\n        from tiledbsoma import SOMATileDBContext\n\n        self._refreshable_credentials = None\n\n        fs = storepath.fs\n        fs.connect()\n        self._fs = fs\n\n        tiledb_config = {}\n\n        endpoint_url = fs.endpoint_url\n        if endpoint_url is not None:\n            tiledb_config[\"vfs.s3.region\"] = \"\"\n            tiledb_config[\"vfs.s3.use_virtual_addressing\"] = \"false\"\n            parsed = urlparse(endpoint_url)\n            tiledb_config[\"vfs.s3.scheme\"] = parsed.scheme\n            tiledb_config[\"vfs.s3.endpoint_override\"] = (\n                parsed._replace(scheme=\"\").geturl().lstrip(\"/\")\n            )\n        else:\n            tiledb_config[\"vfs.s3.region\"] = get_storage_region(storepath)\n\n        if fs.anon:\n            tiledb_config[\"vfs.s3.no_sign_request\"] = \"true\"\n            tiledb_config[\"vfs.s3.aws_access_key_id\"] = \"\"\n            tiledb_config[\"vfs.s3.aws_secret_access_key\"] = \"\"\n            tiledb_config[\"vfs.s3.aws_session_token\"] = \"\"\n        else:\n            aws_key = fs.key\n            aws_secret = fs.secret\n            aws_token = fs.token\n            if aws_key is not None and aws_secret is not None:\n                tiledb_config[\"vfs.s3.aws_access_key_id\"] = aws_key\n                tiledb_config[\"vfs.s3.aws_secret_access_key\"] = aws_secret\n                if aws_token is not None:\n                    tiledb_config[\"vfs.s3.aws_session_token\"] = aws_token\n            else:\n                from aiobotocore.credentials import AioRefreshableCredentials\n\n                if isinstance(\n                    refreshable_credentials := fs.session._credentials,\n                    AioRefreshableCredentials,\n                ):\n                    self._refreshable_credentials = 
refreshable_credentials\n                    tiledb_config.update(self._extract_refreshable_credentials())\n\n        self._context = SOMATileDBContext(tiledb_config=tiledb_config)\n\n    def _extract_refreshable_credentials(self) -> dict:\n        tiledb_config: dict[str, str] = {}\n\n        refreshable_credentials = self._refreshable_credentials\n        if refreshable_credentials is None:\n            return tiledb_config\n        # refresh and retrieve the credentials\n        _ensure_sync_with_fs(refreshable_credentials._refresh, self._fs)()\n        tiledb_config[\"vfs.s3.aws_access_key_id\"] = refreshable_credentials._access_key\n        tiledb_config[\"vfs.s3.aws_secret_access_key\"] = (\n            refreshable_credentials._secret_key\n        )\n        if (aws_token := refreshable_credentials._token) is not None:\n            tiledb_config[\"vfs.s3.aws_session_token\"] = aws_token\n\n        return tiledb_config\n\n    def get_context(self) -> SOMATileDBContext:\n        # update the credentials if needed and return the updated context\n        refreshed_credentials = self._extract_refreshable_credentials()\n        if refreshed_credentials:\n            self._context = self._context.replace(tiledb_config=refreshed_credentials)\n\n        return self._context\n\n\ndef _open_tiledbsoma(\n    storepath: UPath, mode: Literal[\"r\", \"w\"] = \"r\"\n) -> SOMACollection | SOMAExperiment | SOMAMeasurement:\n    \"\"\"Open a TileDB-SOMA store for the given path.\n\n    For S3 paths with federated credentials, credentials are refreshed at\n    open time only (see :class:`SOMAS3ContextFactory`).\n    \"\"\"\n    try:\n        import tiledbsoma as soma\n    except ImportError as e:\n        raise ImportError(\"Please install tiledbsoma: pip install tiledbsoma\") from e\n\n    storepath_str = storepath.as_posix()\n    if storepath.protocol == \"s3\":\n        ctx = SOMAS3ContextFactory(storepath).get_context()\n        # this is a strange bug\n        # for some 
reason iterdir futher gives incorrect results\n        # if cache is not invalidated\n        # instead of obs and ms it gives ms and ms in the list of names\n        storepath.fs.invalidate_cache()\n    else:\n        ctx = None\n\n    soma_objects = [obj.name for obj in storepath.iterdir()]\n    if \"obs\" in soma_objects and \"ms\" in soma_objects:\n        SOMAType = soma.Experiment\n    elif \"var\" in soma_objects:\n        SOMAType = soma.Measurement\n    else:\n        SOMAType = soma.Collection\n    return SOMAType.open(storepath_str, mode=mode, context=ctx)\n\n\ndef save_tiledbsoma_experiment(\n    # Artifact args\n    adatas: list[AnnData | AnyPathStr],\n    key: str | None = None,\n    description: str | None = None,\n    run: Run | None = None,\n    revises: Artifact | None = None,\n    # tiledbsoma.io.from_anndata args\n    measurement_name: str = \"RNA\",\n    obs_id_name: str = \"obs_id\",\n    var_id_name: str = \"var_id\",\n    append_obsm_varm: bool = False,\n    # additional keyword args for tiledbsoma.io.from_anndata\n    **kwargs,\n) -> Artifact:\n    \"\"\"Write `AnnData` to `tiledbsoma.Experiment`.\n\n    Reads `AnnData` objects, writes them to `tiledbsoma.Experiment`, creates & saves an :class:`~lamindb.Artifact`.\n\n    Populates a column `lamin_run_uid` column in `obs` with the current `run.uid`.\n\n    Is based on `tiledbsoma.io.from_anndata\n    <https://tiledbsoma.readthedocs.io/en/latest/_autosummary/tiledbsoma.io.from_anndata.html>`__.\n\n    Args:\n        adatas: `AnnData` objects to write, in-memory or on-disk.\n        key: An optional key to reference the artifact.\n        description: A description.\n        run: The run that creates the artifact.\n        revises: `lamindb.Artifact` with `tiledbsoma.Experiment` to append to.\n        measurement_name: The name of the measurement to store data in `tiledbsoma.Experiment`.\n        obs_id_name: Which `AnnData` `obs` column to use for append mode.\n        var_id_name: Which 
`AnnData` `var` column to use for append mode.\n        append_obsm_varm: Whether to append `obsm` and `varm` in append mode .\n        **kwargs: Keyword arguments passed to `tiledbsoma.io.from_anndata`.\n\n    Note:\n        For S3 storage with federated credentials, credentials are\n        updated only when the store is opened for each write step, not while a\n        store handle is held open. Retry if credentials expire during a long write operation.\n    \"\"\"\n    try:\n        import tiledbsoma as soma\n        import tiledbsoma.io as soma_io\n    except ImportError as e:\n        raise ImportError(\"Please install tiledbsoma: pip install tiledbsoma\") from e\n\n    from lamindb.core.storage.paths import auto_storage_key_from_artifact_uid\n    from lamindb.models import Artifact\n    from lamindb.models._is_versioned import create_uid\n    from lamindb.models.artifact import get_run\n\n    run = get_run(run)\n\n    appending = revises is not None\n    if appending:\n        storepath = revises.path\n    else:\n        uid, _ = create_uid(n_full_id=20)\n        storage_key = auto_storage_key_from_artifact_uid(\n            uid, \".tiledbsoma\", overwrite_versions=True\n        )\n        storepath = setup_settings.storage.root / storage_key\n\n    if storepath.protocol == \"s3\":  # type: ignore\n        ctx_factory = SOMAS3ContextFactory(storepath)\n    else:\n        ctx_factory = None\n\n    storepath_str = storepath.as_posix()\n\n    add_run_uid = True\n    run_uid_dtype = \"category\"\n    if appending:\n        ctx = None if ctx_factory is None else ctx_factory.get_context()\n        with soma.Experiment.open(storepath_str, mode=\"r\", context=ctx) as store:\n            obs_schema = store[\"obs\"].schema\n            add_run_uid = \"lamin_run_uid\" in obs_schema.names\n            # this is needed to enable backwards compatibility with tiledbsoma stores\n            # created before PR 2300\n            if add_run_uid:\n                column_type = 
obs_schema.types[obs_schema.names.index(\"lamin_run_uid\")]\n                if not isinstance(column_type, pa.DictionaryType):\n                    run_uid_dtype = None\n\n    if add_run_uid and run is None:\n        raise ValueError(\"Pass `run`\")\n\n    adata_objects = []\n    for adata in adatas:\n        if isinstance(adata, AnnData):\n            if add_run_uid and adata.is_view:\n                raise ValueError(\n                    \"Can not write an `AnnData` view, please do `adata.copy()` before passing.\"\n                )\n        else:\n            adata = _load_h5ad_zarr(create_path(adata))\n        if add_run_uid:\n            adata.obs[\"lamin_run_uid\"] = pd.Series(\n                run.uid, index=adata.obs.index, dtype=run_uid_dtype\n            )\n        adata_objects.append(adata)\n\n    registration_mapping = kwargs.get(\"registration_mapping\", None)\n    if registration_mapping is None and (appending or len(adata_objects) > 1):\n        ctx = None if ctx_factory is None else ctx_factory.get_context()\n        registration_mapping = soma_io.register_anndatas(\n            experiment_uri=storepath_str if appending else None,\n            adatas=adata_objects,\n            measurement_name=measurement_name,\n            obs_field_name=obs_id_name,\n            var_field_name=var_id_name,\n            append_obsm_varm=append_obsm_varm,\n            context=ctx,\n        )\n\n    prepare_experiment = False\n    resize_experiment = False\n    if registration_mapping is not None:\n        soma_version_parsed = version.parse(soma.__version__)\n        if soma_version_parsed < version.parse(\"1.15.0rc4\"):\n            n_observations = len(registration_mapping.obs_axis.data)\n        else:\n            n_observations = registration_mapping.get_obs_shape()\n            prepare_experiment = soma_version_parsed >= version.parse(\"1.16.2\")\n            resize_experiment = not prepare_experiment\n    else:  # happens only if not appending and only one 
adata passed\n        assert len(adata_objects) == 1  # noqa: S101\n        n_observations = adata_objects[0].n_obs\n\n    logger.important(f\"writing the tiledbsoma store to {storepath_str}\")\n    experiment_exists: bool | None = None\n    for adata_obj in adata_objects:\n        # do not recheck if True\n        if not experiment_exists and (resize_experiment or prepare_experiment):\n            ctx = None if ctx_factory is None else ctx_factory.get_context()\n            experiment_exists = soma.Experiment.exists(storepath_str, context=ctx)\n        if experiment_exists:\n            # both can only happen if registration_mapping is not None\n            if resize_experiment:\n                ctx = None if ctx_factory is None else ctx_factory.get_context()\n                soma_io.resize_experiment(\n                    storepath_str,\n                    nobs=n_observations,\n                    nvars=registration_mapping.get_var_shapes(),\n                    context=ctx,\n                )\n                resize_experiment = False\n            elif prepare_experiment:\n                ctx = None if ctx_factory is None else ctx_factory.get_context()\n                registration_mapping.prepare_experiment(storepath_str, context=ctx)\n                prepare_experiment = False\n        registration_mapping_write = (\n            registration_mapping.subset_for_anndata(adata_obj)\n            if hasattr(registration_mapping, \"subset_for_anndata\")\n            else registration_mapping\n        )\n        ctx = None if ctx_factory is None else ctx_factory.get_context()\n        soma_io.from_anndata(\n            storepath_str,\n            adata_obj,\n            measurement_name,\n            context=ctx,\n            obs_id_name=obs_id_name,\n            var_id_name=var_id_name,\n            registration_mapping=registration_mapping_write,\n            **kwargs,\n        )\n\n    artifact = Artifact(  # type: ignore\n        storepath,\n        key=key,\n   
     description=description,\n        run=run,\n        revises=revises,\n        _is_internal_call=True,\n    )\n    artifact.n_observations = n_observations\n    artifact.otype = \"tiledbsoma\"\n\n    return artifact.save()\n\n\n# this is less defensive than _anndata_n_observations\n# this doesn't really catches errors\n# assumes that the tiledbsoma object is well-formed\ndef _soma_store_n_observations(obj) -> int:\n    if obj.soma_type in {\"SOMADataFrame\", \"SOMASparseNDArray\", \"SOMADenseNDArray\"}:\n        return obj.non_empty_domain()[0][1] + 1\n    elif obj.soma_type == \"SOMAExperiment\":\n        return _soma_store_n_observations(obj[\"obs\"])\n    elif obj.soma_type == \"SOMAMeasurement\":\n        keys = obj.keys()\n        for slot in (\"X\", \"obsm\", \"obsp\"):\n            if slot in keys:\n                return _soma_store_n_observations(next(iter(obj[slot].values())))\n    elif obj.soma_type == \"SOMACollection\":\n        n_obs = 0\n        for value in obj.values():\n            n_obs += _soma_store_n_observations(value)\n        return n_obs\n    raise ValueError(\n        \"Could not infer the number of observations from the tiledbsoma object.\"\n    )\n\n\ndef _soma_n_observations(objectpath: UPath) -> int:\n    with _open_tiledbsoma(objectpath, mode=\"r\") as store:\n        return _soma_store_n_observations(store)\n"
  },
  {
    "path": "lamindb/core/storage/_valid_suffixes.py",
    "content": "from __future__ import annotations\n\nfrom lamindb_setup.core.upath import VALID_COMPOSITE_SUFFIXES, VALID_SIMPLE_SUFFIXES\n\n# add new composite suffixes like so\nVALID_COMPOSITE_SUFFIXES.update(\n    {\n        \".vitessce.json\",\n        \".ome.zarr\",\n    }\n)\n# can do the same for simple valid suffixes\n\n\nclass VALID_SUFFIXES:\n    \"\"\"Valid suffixes.\"\"\"\n\n    SIMPLE: set[str] = VALID_SIMPLE_SUFFIXES\n    \"\"\"Simple suffixes.\"\"\"\n    COMPOSITE: set[str] = VALID_COMPOSITE_SUFFIXES\n    \"\"\"Composite suffixes.\"\"\"\n"
  },
  {
    "path": "lamindb/core/storage/_zarr.py",
    "content": "from __future__ import annotations\n\nfrom importlib.metadata import version as get_version\nfrom typing import TYPE_CHECKING, Literal\n\nimport zarr\nfrom lamin_utils import logger\nfrom lamindb_setup.core.upath import LocalPathClasses, S3FSMap, UPath, create_mapper\nfrom packaging import version\n\nfrom lamindb.core._compat import with_package\n\nif version.parse(get_version(\"anndata\")) < version.parse(\"0.11.0\"):\n    from anndata._io import read_zarr as read_anndata_zarr\nelse:\n    from anndata.io import read_zarr as read_anndata_zarr\n\nif version.parse(zarr.__version__) >= version.parse(\"3.0.0a0\"):\n    IS_ZARR_V3 = True\n    from zarr.abc.store import Store\nelse:\n    IS_ZARR_V3 = False\n    from zarr.storage import Store  # noqa\n\nif TYPE_CHECKING:\n    from fsspec import FSMap\n    from lamindb_setup.types import AnyPathStr\n\n    from lamindb.core.storage.types import ScverseDataStructures\n\n\ndef get_zarr_store(\n    path: AnyPathStr, *, check: bool = False, create: bool = False\n) -> str | S3FSMap | FSMap | Store:\n    \"\"\"Creates the correct object that can be used to open a zarr file depending on local or remote location.\"\"\"\n    storepath, storepath_str = UPath(path), str(path)\n    if isinstance(storepath, LocalPathClasses):\n        store = storepath_str\n    elif IS_ZARR_V3:\n        # todo: also check how to treat non-asynchronous filesystems\n        # zarr has something for this, using fsspec async wrapper\n        # check FsspecStore code\n        store = zarr.storage.FsspecStore.from_upath(UPath(storepath, asynchronous=True))\n    else:\n        store = create_mapper(storepath.fs, storepath_str, check=check, create=create)\n\n    return store\n\n\ndef _identify_zarr_type_from_storage(\n    storage: zarr.Group,\n) -> Literal[\"anndata\", \"mudata\", \"spatialdata\", \"unknown\"]:\n    \"\"\"Internal helper to identify zarr type from an open storage object.\"\"\"\n    try:\n        if 
storage.attrs.get(\"encoding-type\", \"\") == \"anndata\":\n            return \"anndata\"\n        elif storage.attrs.get(\"encoding-type\", \"\") == \"MuData\":\n            return \"mudata\"\n        elif \"spatialdata_attrs\" in storage.attrs:\n            return \"spatialdata\"\n    except Exception as error:\n        logger.warning(f\"an exception occurred {error}\")\n    return \"unknown\"\n\n\ndef identify_zarr_type(\n    storepath: AnyPathStr, *, check: bool = True\n) -> Literal[\"anndata\", \"mudata\", \"spatialdata\", \"unknown\"]:\n    \"\"\"Identify whether a zarr store is AnnData, SpatialData, or unknown type.\"\"\"\n    suffixes = UPath(storepath).suffixes\n    if \".anndata\" in suffixes:\n        return \"anndata\"\n    elif \".mudata\" in suffixes:\n        return \"mudata\"\n    elif \".spatialdata\" in suffixes:\n        return \"spatialdata\"\n\n    store = get_zarr_store(storepath, check=check)\n    try:\n        storage = zarr.open(store, mode=\"r\")\n        return _identify_zarr_type_from_storage(storage)\n    except Exception as error:\n        logger.warning(\n            f\"an exception occured while trying to open the zarr store\\n {error}\"\n        )\n    return \"unknown\"\n\n\ndef load_zarr(\n    storepath: AnyPathStr,\n    expected_type: Literal[\"anndata\", \"mudata\", \"spatialdata\"] = None,\n) -> ScverseDataStructures:\n    \"\"\"Loads a zarr store and returns the corresponding scverse data structure.\n\n    Args:\n        storepath: Path to the zarr store\n        expected_type: If provided, ensures the zarr store is of this type (\"anndata\", \"mudata\", \"spatialdata\")\n                       and raises ValueError if it's not\n    \"\"\"\n    store = get_zarr_store(storepath, check=True)\n    # Open the storage once\n    try:\n        storage = zarr.open(store, mode=\"r\")\n    except Exception as error:\n        raise ValueError(f\"Could not open zarr store: {error}\") from None\n\n    actual_type = 
_identify_zarr_type_from_storage(storage)\n    if expected_type is not None and actual_type != expected_type:\n        raise ValueError(\n            f\"Expected zarr store of type '{expected_type}', but found '{actual_type}'\"\n        )\n\n    match actual_type:\n        case \"anndata\":\n            scverse_obj = read_anndata_zarr(store)\n        case \"mudata\":\n            scverse_obj = with_package(\"mudata\", lambda mod: mod.read_zarr(store))\n        case \"spatialdata\":\n            scverse_obj = with_package(\"spatialdata\", lambda mod: mod.read_zarr(store))\n        case \"unknown\" | _:\n            raise ValueError(\n                \"Unable to determine zarr store format and therefore cannot load Artifact.\"\n            )\n    return scverse_obj\n"
  },
  {
    "path": "lamindb/core/storage/objects.py",
    "content": "from __future__ import annotations\n\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, TypeAlias\n\nfrom lamindb.core._compat import (\n    with_package_obj,\n)\n\nif TYPE_CHECKING:\n    from pandas import DataFrame\n\n    from .types import ScverseDataStructures\n\n    SupportedDataTypes: TypeAlias = DataFrame | ScverseDataStructures\nelse:\n    SupportedDataTypes: TypeAlias = Any\n\n\ndef infer_suffix(\n    dmem: SupportedDataTypes, format: str | dict[str, Any] | None = None\n) -> str:\n    \"\"\"Infer LaminDB storage file suffix from a data object.\"\"\"\n    has_anndata, anndata_suffix = with_package_obj(\n        dmem,\n        \"AnnData\",\n        \"anndata\",\n        lambda obj: _infer_anndata_suffix(format),\n    )\n    if has_anndata:\n        return anndata_suffix\n\n    has_dataframe, dataframe_suffix = with_package_obj(\n        dmem,\n        \"DataFrame\",\n        \"pandas\",\n        lambda obj: _infer_dataframe_suffix(format),\n    )\n    if has_dataframe:\n        return dataframe_suffix\n\n    if with_package_obj(\n        dmem,\n        \"MuData\",\n        \"mudata\",\n        lambda obj: True,  # Just checking type, not calling any method\n    )[0]:\n        return \".h5mu\"\n\n    has_spatialdata, spatialdata_suffix = with_package_obj(\n        dmem,\n        \"SpatialData\",\n        \"spatialdata\",\n        lambda obj: _infer_spatialdata_suffix(format),\n    )\n    if has_spatialdata:\n        return spatialdata_suffix\n    else:\n        raise NotImplementedError\n\n\ndef _infer_anndata_suffix(format: str | dict[str, Any] | None) -> str:\n    assert not isinstance(format, dict)  # noqa: S101\n    if format is not None:\n        # should be `.h5ad`, `.`zarr`, or `.anndata.zarr`\n        if format not in {\"h5ad\", \"zarr\", \"anndata.zarr\"}:\n            raise ValueError(\n                \"Error when specifying AnnData storage format, it should be\"\n                f\" 'h5ad', 'zarr', not '{format}'. 
Check 'format'\"\n                \" or the suffix of 'key'.\"\n            )\n        return \".\" + format\n    return \".h5ad\"\n\n\ndef _infer_dataframe_suffix(format: str | dict[str, Any] | None) -> str:\n    if isinstance(format, str):\n        if format == \".csv\":\n            return \".csv\"\n    elif isinstance(format, dict):\n        if format.get(\"suffix\") == \".csv\":\n            return \".csv\"\n    return \".parquet\"\n\n\ndef _infer_spatialdata_suffix(format: str | dict[str, Any] | None) -> str:\n    if format is None:\n        return \".zarr\"\n    if isinstance(format, str) and format in {\"spatialdata.zarr\", \"zarr\"}:\n        return format\n    raise ValueError(\n        \"Error when specifying SpatialData storage format, it should be\"\n        f\" 'zarr', 'spatialdata.zarr', not '{format}'. Check 'format'\"\n        \" or the suffix of 'key'.\"\n    )\n\n\n# for types below note that local UPaths are subclasses of Path\n# Path(UPath(...)) properly coerces local UPaths and throws an error for cloud UPaths\n\n\ndef write_to_disk(dmem: SupportedDataTypes, filepath: Path | str, **kwargs) -> None:\n    \"\"\"Writes the passed in memory data to disk to a specified path.\"\"\"\n    if with_package_obj(\n        dmem,\n        \"AnnData\",\n        \"anndata\",\n        lambda obj: _write_anndata(obj, filepath, **kwargs),\n    )[0]:\n        return\n\n    if with_package_obj(\n        dmem,\n        \"DataFrame\",\n        \"pandas\",\n        lambda obj: _write_dataframe(obj, filepath, **kwargs),\n    )[0]:\n        return\n\n    if with_package_obj(dmem, \"MuData\", \"mudata\", lambda obj: obj.write(filepath))[0]:\n        return\n\n    if with_package_obj(\n        dmem,\n        \"SpatialData\",\n        \"spatialdata\",\n        lambda obj: obj.write(filepath, overwrite=True),\n    )[0]:\n        return\n\n    raise NotImplementedError\n\n\ndef _write_anndata(dmem: Any, filepath: Path | str, **kwargs) -> None:\n    suffix = 
Path(filepath).suffix\n    if suffix == \".h5ad\":\n        dmem.write_h5ad(filepath, **kwargs)\n        return\n    elif suffix == \".zarr\":\n        dmem.write_zarr(filepath, **kwargs)\n        return\n    else:\n        raise NotImplementedError\n\n\ndef _write_dataframe(dmem: Any, filepath: Path | str, **kwargs) -> None:\n    suffix = Path(filepath).suffix\n    if suffix == \".csv\":\n        dmem.to_csv(filepath, **kwargs)\n        return\n    dmem.to_parquet(filepath, **kwargs)\n"
  },
  {
    "path": "lamindb/core/storage/paths.py",
    "content": "from __future__ import annotations\n\nimport shutil\nfrom typing import TYPE_CHECKING\n\nimport fsspec\nfrom lamindb_setup.core import StorageSettings\nfrom lamindb_setup.core.upath import (\n    LocalPathClasses,\n    UPath,\n)\n\nfrom lamindb.core._settings import settings\n\nif TYPE_CHECKING:\n    from lamindb_setup.types import AnyPath, AnyPathStr\n\n    from lamindb.models.artifact import Artifact\n\n\nAUTO_KEY_PREFIX = \".lamindb/\"\n\n\n# add type annotations back asap when re-organizing the module\ndef auto_storage_key_from_artifact(artifact: Artifact):\n    if (real_key := artifact._real_key) is not None:\n        return real_key\n    key = artifact.key\n    if key is None or artifact._key_is_virtual:\n        return auto_storage_key_from_artifact_uid(\n            artifact.uid, artifact.suffix, artifact.overwrite_versions\n        )\n    return artifact.key\n\n\ndef auto_storage_key_from_artifact_uid(\n    uid: str, suffix: str, overwrite_versions: bool\n) -> str:\n    assert isinstance(suffix, str)  # noqa: S101 Suffix cannot be None.\n    if overwrite_versions:\n        uid_storage = uid[:16]  # 16 chars, leave 4 chars for versioning\n    else:\n        uid_storage = uid\n    storage_key = f\"{AUTO_KEY_PREFIX}{uid_storage}{suffix}\"\n    return storage_key\n\n\ndef check_path_is_child_of_root(path: AnyPathStr, root: AnyPathStr) -> bool:\n    if fsspec.utils.get_protocol(str(path)) != fsspec.utils.get_protocol(str(root)):\n        return False\n    path_upath = UPath(path)\n    root_upath = UPath(root)\n    if path_upath.protocol == \"s3\":\n        endpoint_path = path_upath.storage_options.get(\"endpoint_url\", \"\")\n        endpoint_root = root_upath.storage_options.get(\"endpoint_url\", \"\")\n        if endpoint_path != endpoint_root:\n            return False\n    # we don't resolve http links because they can resolve into a different domain\n    # for example into a temporary url\n    if path_upath.protocol not in {\"http\", 
\"https\"}:\n        path_upath = path_upath.resolve()\n        root_upath = root_upath.resolve()\n    # str is needed to eliminate UPath storage_options\n    # which affect equality checks\n    return UPath(str(root_upath)) in UPath(str(path_upath)).parents\n\n\n# returns filepath and root of the storage\ndef attempt_accessing_path(\n    artifact: Artifact,\n    storage_key: str,\n    using_key: str | None = None,\n    access_token: str | None = None,\n) -> tuple[UPath, StorageSettings]:\n    # check whether the file is in the default db and whether storage\n    # matches default storage\n    from lamindb.models import Storage\n\n    if (\n        artifact._state.db in (\"default\", None)\n        and artifact.storage_id == settings._storage_settings._id\n    ):\n        if access_token is None:\n            storage_settings = settings._storage_settings\n        else:\n            storage_settings = StorageSettings(\n                settings.storage.root, access_token=access_token\n            )\n    else:\n        if artifact._state.db not in (\"default\", None) and using_key is None:\n            storage = Storage.connect(artifact._state.db).get(id=artifact.storage_id)\n        else:\n            storage = Storage.objects.using(using_key).get(id=artifact.storage_id)\n        # find a better way than passing None to instance_settings in the future!\n        storage_settings = StorageSettings(storage.root, access_token=access_token)\n    path = storage_settings.key_to_filepath(storage_key)\n    return path, storage_settings\n\n\ndef filepath_from_artifact(\n    artifact: Artifact, using_key: str | None = None\n) -> tuple[UPath, StorageSettings | None]:\n    if (local_filepath := getattr(artifact, \"_local_filepath\", None)) is not None:\n        return local_filepath.resolve(), None\n    storage_key = auto_storage_key_from_artifact(artifact)\n    path, storage_settings = attempt_accessing_path(\n        artifact, storage_key, using_key=using_key\n    )\n    return 
path, storage_settings\n\n\n# virtual key is taken into consideration\n# only if the version is latest\ndef _cache_key_from_artifact_storage(\n    artifact: Artifact, storage_settings: StorageSettings | None\n):\n    cache_key = None\n    if (\n        artifact._key_is_virtual\n        and artifact.key is not None\n        and storage_settings is not None\n        and artifact.is_latest\n    ):\n        root = storage_settings.root\n        cache_key = (root / artifact.key).path\n        # .path does not strip protocol for http\n        # have to do it manually\n        if root.protocol in {\"http\", \"https\"}:\n            cache_key = cache_key.split(\"://\", 1)[-1]\n    return cache_key\n\n\n# return filepath and cache_key if needed\ndef filepath_cache_key_from_artifact(\n    artifact: Artifact, using_key: str | None = None\n) -> tuple[UPath, str | None]:\n    filepath, storage_settings = filepath_from_artifact(artifact, using_key)\n    if isinstance(filepath, LocalPathClasses):\n        return filepath, None\n    cache_key = _cache_key_from_artifact_storage(artifact, storage_settings)\n    return filepath, cache_key\n\n\ndef store_file_or_folder(\n    local_path: AnyPathStr, storage_path: UPath, print_progress: bool = True, **kwargs\n) -> None:\n    \"\"\"Store file or folder (localpath) at storagepath.\"\"\"\n    local_path = UPath(local_path)\n    if not isinstance(storage_path, LocalPathClasses):\n        # this uploads files and directories\n        if local_path.is_dir():\n            create_folder = False\n            try:\n                # if storage_path already exists we need to delete it\n                # if local_path is a directory\n                # to replace storage_path correctly\n                if storage_path.stat().as_info()[\"type\"] == \"directory\":\n                    storage_path.rmdir()\n                else:\n                    storage_path.unlink()\n            except (FileNotFoundError, PermissionError):\n                pass\n  
      else:\n            create_folder = None\n        storage_path.upload_from(\n            local_path,\n            create_folder=create_folder,\n            print_progress=print_progress,\n            **kwargs,\n        )\n    else:  # storage path is local\n        if local_path.resolve().as_posix() == storage_path.resolve().as_posix():\n            return None\n        storage_path.parent.mkdir(parents=True, exist_ok=True)\n        if local_path.is_file():\n            shutil.copyfile(local_path, storage_path)\n        else:\n            if storage_path.exists():\n                shutil.rmtree(storage_path)\n            shutil.copytree(local_path, storage_path)\n\n\ndef delete_storage_using_key(\n    artifact: Artifact,\n    storage_key: str,\n    raise_file_not_found_error: bool = True,\n    using_key: str | None = None,\n) -> None | str:\n    filepath, _ = attempt_accessing_path(artifact, storage_key, using_key=using_key)\n    return delete_storage(\n        filepath, raise_file_not_found_error=raise_file_not_found_error\n    )\n\n\ndef delete_storage(\n    storagepath: AnyPath, raise_file_not_found_error: bool = True\n) -> None | str:\n    \"\"\"Delete arbitrary artifact.\"\"\"\n    if storagepath.is_file():\n        storagepath.unlink()\n    elif storagepath.is_dir():\n        if isinstance(storagepath, LocalPathClasses):\n            shutil.rmtree(storagepath)\n        else:\n            storagepath.rmdir()\n    elif raise_file_not_found_error:\n        raise FileNotFoundError(f\"{storagepath} is not an existing path!\")\n    else:\n        return \"did-not-delete\"\n    return None\n"
  },
  {
    "path": "lamindb/core/storage/types.py",
    "content": "\"\"\"Storage-related type definitions.\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Any\n\nif TYPE_CHECKING:\n    from anndata import AnnData\n    from mudata import MuData\n    from spatialdata import SpatialData\n\n    ScverseDataStructures = AnnData | MuData | SpatialData\nelse:\n    # AnnData | MuData | SpatialData; Any required for union with DataFrame in objects.py\n    ScverseDataStructures = Any\n"
  },
  {
    "path": "lamindb/core/subsettings/__init__.py",
    "content": "\"\"\"Sub settings.\n\n.. autoclass:: CreationSettings\n.. autoclass:: AnnotationSettings\n\n\"\"\"\n\nfrom ._annotation_settings import AnnotationSettings\nfrom ._creation_settings import CreationSettings\n"
  },
  {
    "path": "lamindb/core/subsettings/_annotation_settings.py",
    "content": "class AnnotationSettings:\n    n_max_records: int = 1000\n    \"\"\"Maximal number of records to annotate with during automated annotation.\n\n    If the number of records to annotate exceeds this limit, print a warning and do not annotate.\n\n    The number is calculated per feature for labels, and per schema for features.\n    \"\"\"\n\n\nannotation_settings = AnnotationSettings()\n"
  },
  {
    "path": "lamindb/core/subsettings/_creation_settings.py",
    "content": "class CreationSettings:\n    search_names: bool = True\n    \"\"\"Switch off to speed up creating records (default `True`).\n\n    If `True`, search for alternative names and avoids duplicates.\n\n    FAQ: :doc:`/faq/idempotency`\n    \"\"\"\n    artifact_skip_size_hash: bool = False\n    \"\"\"To speed up registering high numbers of files (default `False`).\n\n    This bypasses queries for size and hash to AWS & GCP.\n\n    It speeds up file creation by about a factor 100.\n    \"\"\"\n    artifact_silence_missing_run_warning: bool = False\n    \"\"\"Silence warning about missing run & transform during artifact creation (default `False`).\"\"\"\n    _artifact_use_virtual_keys: bool = True\n    \"\"\"Treat `key` parameter in :class:`~lamindb.Artifact` as virtual.\n\n    If `True`, the `key` is **not** used to construct file paths, but file paths are\n    based on the `uid` of artifact.\n    \"\"\"\n\n\ncreation_settings = CreationSettings()\n"
  },
  {
    "path": "lamindb/curators/__init__.py",
    "content": "\"\"\"Curators.\n\nHigh-level curators\n-------------------\n\n.. autoclass:: DataFrameCurator\n.. autoclass:: AnnDataCurator\n.. autoclass:: MuDataCurator\n.. autoclass:: SpatialDataCurator\n.. autoclass:: TiledbsomaExperimentCurator\n\nLow-level module\n----------------\n\n.. autosummary::\n   :toctree: .\n\n   core\n\n\"\"\"\n\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from .core import (\n        AnnDataCurator,\n        DataFrameCurator,\n        MuDataCurator,\n        SpatialDataCurator,\n        TiledbsomaExperimentCurator,\n    )\n\n__all__ = [\n    \"AnnDataCurator\",\n    \"DataFrameCurator\",\n    \"MuDataCurator\",\n    \"SpatialDataCurator\",\n    \"TiledbsomaExperimentCurator\",\n]\n\n_CURATOR_NAMES = frozenset(__all__)\n\n\ndef __getattr__(name: str):\n    \"\"\"Lazy-import curators from core to avoid loading pandas/pandera at import.\"\"\"\n    if name in _CURATOR_NAMES:\n        from . import core\n\n        attr = getattr(core, name)\n        globals()[name] = attr\n        return attr\n    raise AttributeError(f\"module {__name__!r} has no attribute {name!r}\")\n"
  },
  {
    "path": "lamindb/curators/core.py",
    "content": "\"\"\"Curator utilities.\n\n.. autoclass:: Curator\n.. autoclass:: SlotsCurator\n.. autoclass:: ComponentCurator\n.. autoclass:: CatVector\n.. autoclass:: CatLookup\n.. autoclass:: DataFrameCatManager\n\n\"\"\"\n\nfrom __future__ import annotations\n\nimport copy\nimport re\nfrom typing import TYPE_CHECKING, Any, Callable\n\nimport lamindb_setup as ln_setup\nimport numpy as np\nimport pandas as pd\nimport pandera.pandas as pandera\nfrom django.db.models import Q\nfrom lamin_utils import colors, logger\nfrom lamindb_setup.core._docs import doc_args\nfrom lamindb_setup.core.upath import LocalPathClasses\n\nfrom lamindb.base.dtypes import check_dtype\nfrom lamindb.base.types import FieldAttr  # noqa\nfrom lamindb.models import (\n    Artifact,\n    Feature,\n    Run,\n    Schema,\n    SQLRecord,\n)\nfrom lamindb.models._from_values import _format_values, _from_values\nfrom lamindb.models.artifact import (\n    data_is_scversedatastructure,\n    data_is_soma_experiment,\n)\nfrom lamindb.models.feature import (\n    parse_cat_dtype,\n    parse_dtype,\n    parse_filter_string,\n    resolve_relation_filters,\n)\nfrom lamindb.models.query_set import BasicQuerySet, SQLRecordList\nfrom lamindb.models.sqlrecord import HasType\n\nfrom ..errors import InvalidArgument, ValidationError\nfrom ..models._from_values import get_organism_record_from_field\nfrom ..models.feature import get_record_type_from_uid\n\nif TYPE_CHECKING:\n    from collections.abc import Iterable\n    from typing import Any\n\n    from anndata import AnnData\n    from mudata import MuData\n    from spatialdata import SpatialData\n    from tiledbsoma._experiment import Experiment as SOMAExperiment\n\n    from lamindb.core.storage.types import ScverseDataStructures\n\n\ndef strip_ansi_codes(text):\n    # This pattern matches ANSI escape sequences\n    ansi_pattern = re.compile(r\"\\x1b\\[[0-9;]*m\")\n    return ansi_pattern.sub(\"\", text)\n\n\nclass CatLookup:\n    \"\"\"Lookup categories from 
the reference instance.\n\n    Args:\n        categoricals: A dictionary of categorical fields to lookup.\n        slots: A dictionary of slot fields to lookup.\n        public: Whether to lookup from the public instance. Defaults to False.\n\n    Example::\n\n        curator = ln.curators.DataFrameCurator(...)\n        curator.cat.lookup()[\"cell_type\"].alveolar_type_1_fibroblast_cell\n\n    \"\"\"\n\n    def __init__(\n        self,\n        categoricals: list[Feature] | dict[str, FieldAttr],\n        slots: dict[str, FieldAttr] = None,\n        public: bool = False,\n        sources: dict[str, SQLRecord] | None = None,\n    ) -> None:\n        slots = slots or {}\n        if isinstance(categoricals, list):\n            categoricals = {\n                feature.name: parse_dtype(feature._dtype_str)[0][\"field\"]\n                for feature in categoricals\n            }\n        self._categoricals = {**categoricals, **slots}\n        self._public = public\n        self._sources = sources\n\n    def __getattr__(self, name):\n        if name in self._categoricals:\n            registry = self._categoricals[name].field.model\n            if self._public and hasattr(registry, \"public\"):\n                return registry.public(source=self._sources.get(name)).lookup()\n            else:\n                return registry.lookup()\n        raise AttributeError(\n            f'\"{self.__class__.__name__}\" object has no attribute \"{name}\"'\n        )\n\n    def __getitem__(self, name):\n        if name in self._categoricals:\n            registry = self._categoricals[name].field.model\n            if self._public and hasattr(registry, \"public\"):\n                return registry.public(source=self._sources.get(name)).lookup()\n            else:\n                return registry.lookup()\n        raise AttributeError(\n            f'\"{self.__class__.__name__}\" object has no attribute \"{name}\"'\n        )\n\n    def __repr__(self) -> str:\n        if 
len(self._categoricals) > 0:\n            getattr_keys = \"\\n \".join(\n                [f\".{key}\" for key in self._categoricals if key.isidentifier()]\n            )\n            getitem_keys = \"\\n \".join(\n                [str([key]) for key in self._categoricals if not key.isidentifier()]\n            )\n            ref = \"public\" if self._public else \"registries\"\n            return (\n                f\"Lookup objects from the {colors.italic(ref)}:\\n \"\n                f\"{colors.green(getattr_keys)}\\n \"\n                f\"{colors.green(getitem_keys)}\\n\"\n                'Example:\\n    → categories = curator.lookup()[\"cell_type\"]\\n'\n                \"    → categories.alveolar_type_1_fibroblast_cell\\n\\n\"\n                \"To look up public ontologies, use .lookup(public=True)\"\n            )\n        else:  # pragma: no cover\n            return colors.warning(\"No fields are found!\")\n\n\nCAT_MANAGER_DOCSTRING = \"\"\"Manage categoricals by updating registries.\"\"\"\n\n\nSLOTS_DOCSTRING = \"\"\"Access sub curators by slot.\"\"\"\n\nSLOTS_DETAILS_DOCSTRING = \"\"\"Uses **slots** to specify which component contains which schema. Slots are keys that identify where features are stored within composite data structures.\"\"\"\n\nVALIDATE_DOCSTRING = \"\"\"Validate dataset against Schema.\n\nRaises:\n    lamindb.errors.ValidationError: If validation fails.\n\"\"\"\n\nSAVE_ARTIFACT_DOCSTRING = \"\"\"Save an annotated artifact.\n\nArgs:\n    key: A path-like key to reference artifact in default storage, e.g., `\"myfolder/myfile.fcs\"`. Artifacts with the same key form a version family.\n    description: A description.\n    revises: Previous version of the artifact. 
Is an alternative way to passing `key` to trigger a new version.\n    run: The run that creates the artifact.\n\nReturns:\n    A saved artifact record.\n\"\"\"\n\nLAMINDB_COLUMN_PREFIX_REGEX = r\"^__lamindb_.*$\"\n\n\nclass Curator:\n    \"\"\"Curator base class.\n\n    A `Curator` object makes it easy to validate, standardize & annotate datasets.\n\n    See:\n        - :class:`~lamindb.curators.DataFrameCurator`\n        - :class:`~lamindb.curators.AnnDataCurator`\n        - :class:`~lamindb.curators.MuDataCurator`\n        - :class:`~lamindb.curators.SpatialDataCurator`\n        - :class:`~lamindb.curators.TiledbsomaExperimentCurator`\n    \"\"\"\n\n    def __init__(\n        self,\n        dataset: Any,\n        schema: Schema,\n        *,\n        features: dict[str, Any] | None = None,\n        require_saved_schema: bool = True,\n    ) -> None:\n        if not isinstance(schema, Schema):\n            raise InvalidArgument(\"schema argument must be a Schema record.\")\n        if require_saved_schema and schema.pk is None:\n            raise ValueError(\n                \"Schema must be saved before curation. 
Please save it using '.save()'.\"\n            )\n        self._artifact: Artifact | None = None\n        self._dataset: Any = None\n        # self._dataset is set below, it is opened or loaded if dataset is an Artifact\n        if isinstance(dataset, Artifact):\n            self._artifact = dataset\n            if self._artifact.otype in {\n                \"DataFrame\",\n                \"AnnData\",\n                \"MuData\",\n                \"SpatialData\",\n            }:\n                if (\n                    not isinstance(self._artifact.path, LocalPathClasses)\n                    and self._artifact.otype == \"AnnData\"\n                ):\n                    try:\n                        self._dataset = self._artifact.open(mode=\"r\")\n                        logger.important(\n                            \"opened remote artifact for streaming during validation\"\n                        )\n                    except Exception as e:\n                        logger.warning(\n                            f\"unable to open remote AnnData Artifact: {e}, falling back to loading into memory\"\n                        )\n                if self._dataset is None:\n                    logger.important(\"loading artifact into memory for validation\")\n                    self._dataset = self._artifact.load(is_run_input=False)\n            else:\n                raise InvalidArgument(\n                    f\"Cannot load or open artifact of this type: {self._artifact}\"\n                )\n        else:\n            self._dataset = dataset\n        self._schema: Schema = schema\n        self._external_features: dict[str, Any] = features\n        self._is_validated: bool = False\n\n    @doc_args(VALIDATE_DOCSTRING)\n    def validate(self) -> bool | str:\n        \"\"\"{}\"\"\"  # noqa: D415\n        pass  # pragma: no cover\n\n    @doc_args(SAVE_ARTIFACT_DOCSTRING)\n    def save_artifact(\n        self,\n        *,\n        key: str | None = None,\n        
description: str | None = None,\n        revises: Artifact | None = None,\n        run: Run | None = None,\n    ) -> Artifact:\n        \"\"\"{}\"\"\"  # noqa: D415\n        # Note that this docstring has to be consistent with the Artifact()\n        # constructor signature\n        pass  # pragma: no cover\n\n    def __repr__(self) -> str:\n        from lamin_utils import colors\n\n        if self._schema is not None:\n            # Schema might have different attributes\n            if hasattr(self._schema, \"name\") and self._schema.name:\n                schema_str = colors.italic(self._schema.name)\n            elif hasattr(self._schema, \"uid\"):\n                schema_str = colors.italic(f\"uid={self._schema.uid}\")\n            elif hasattr(self._schema, \"id\"):\n                schema_str = colors.italic(f\"id={self._schema.id}\")\n            else:\n                schema_str = colors.italic(\"unnamed\")\n\n            # Add schema type info if available\n            if hasattr(self._schema, \"otype\") and self._schema.otype:\n                schema_str += f\" ({self._schema.otype})\"\n        else:\n            schema_str = colors.warning(\"None\")\n\n        status_str = \"\"\n        if self._is_validated:\n            status_str = f\", {colors.green('validated')}\"\n        else:\n            status_str = f\", {colors.yellow('unvalidated')}\"\n\n        cls_name = colors.green(self.__class__.__name__)\n\n        # Get additional info based on curator type\n        extra_info = \"\"\n        if hasattr(self, \"_slots\") and self._slots:\n            # For SlotsCurator and its subclasses\n            slots_count = len(self._slots)\n            if slots_count > 0:\n                slot_names = list(self._slots.keys())\n                if len(slot_names) <= 3:\n                    extra_info = f\", slots: {slot_names}\"\n                else:\n                    extra_info = f\", slots: [{', '.join(slot_names[:3])}... 
+{len(slot_names) - 3} more]\"\n        elif (\n            cls_name == \"DataFrameCurator\"\n            and hasattr(self, \"cat\")\n            and hasattr(self.cat, \"_categoricals\")\n        ):\n            # For DataFrameCurator\n            cat_count = len(getattr(self.cat, \"_categoricals\", []))\n            if cat_count > 0:\n                extra_info = f\", categorical_features={cat_count}\"\n\n        artifact_info = \"\"\n        if self._artifact is not None:\n            artifact_info = f\", artifact: {colors.italic(self._artifact.uid)}\"\n\n        return (\n            f\"{cls_name}{artifact_info}(Schema: {schema_str}{extra_info}{status_str})\"\n        )\n\n\n@doc_args(SLOTS_DETAILS_DOCSTRING)\nclass SlotsCurator(Curator):\n    \"\"\"Curator for a dataset with slots.\n\n    {}\n\n    Args:\n        dataset: The dataset to validate & annotate.\n        schema: A :class:`~lamindb.Schema` object that defines the validation constraints.\n    \"\"\"\n\n    def __init__(\n        self,\n        dataset: Artifact | ScverseDataStructures | SOMAExperiment,\n        schema: Schema,\n        *,\n        features: dict[str, Any] | None = None,\n        require_saved_schema: bool = True,\n    ) -> None:\n        super().__init__(\n            dataset=dataset,\n            schema=schema,\n            features=features,\n            require_saved_schema=require_saved_schema,\n        )\n        self._slots: dict[str, ComponentCurator] = {}\n\n        # used for multimodal data structures (not AnnData)\n        # in form of {table/modality_key: var_field}\n        self._var_fields: dict[str, FieldAttr] = {}\n        # in form of {table/modality_key: categoricals}\n        self._cat_vectors: dict[str, dict[str, CatVector]] = {}\n\n    @property\n    @doc_args(SLOTS_DOCSTRING)\n    def slots(self) -> dict[str, ComponentCurator]:\n        \"\"\"{}\"\"\"  # noqa: D415\n        return self._slots\n\n    @doc_args(VALIDATE_DOCSTRING)\n    def validate(self) -> None:\n 
       \"\"\"{}\"\"\"  # noqa: D415\n        if \"__external__\" in self._schema.slots:\n            validation_schema = self._schema.slots[\"__external__\"]\n            if not self._external_features:\n                if self._artifact is not None and not self._artifact._state.adding:\n                    logger.important(\n                        \"no new external features provided, using existing external features of artifact for validation\"\n                    )\n                    self._external_features = self._artifact.features.get_values(\n                        external_only=True\n                    )\n                else:\n                    raise ValidationError(\n                        \"External features slot is defined in schema but no external features were provided.\"\n                    )\n            ExperimentalDictCurator(\n                self._external_features, validation_schema\n            ).validate()\n        for slot, curator in self._slots.items():\n            logger.debug(f\"validating slot {slot} ...\")\n            curator.validate()\n        # set _is_validated to True as no slot raised an error\n        self._is_validated = True\n\n    @doc_args(SAVE_ARTIFACT_DOCSTRING)\n    def save_artifact(\n        self,\n        *,\n        key: str | None = None,\n        description: str | None = None,\n        revises: Artifact | None = None,\n        run: Run | None = None,\n    ) -> Artifact:\n        \"\"\"{}\"\"\"  # noqa: D415\n        if not self._is_validated:\n            self.validate()\n\n        if self._artifact is None:\n            type_mapping = [\n                (\n                    lambda dataset: isinstance(dataset, pd.DataFrame),\n                    Artifact.from_dataframe,\n                ),\n                (\n                    lambda dataset: data_is_scversedatastructure(dataset, \"AnnData\"),\n                    Artifact.from_anndata,\n                ),\n                (\n                    
lambda dataset: data_is_scversedatastructure(dataset, \"MuData\"),\n                    Artifact.from_mudata,\n                ),\n                (\n                    lambda dataset: data_is_scversedatastructure(\n                        dataset, \"SpatialData\"\n                    ),\n                    Artifact.from_spatialdata,\n                ),\n                (data_is_soma_experiment, Artifact.from_tiledbsoma),\n            ]\n            for type_check, af_constructor in type_mapping:\n                if type_check(self._dataset):\n                    self._artifact = af_constructor(  # type: ignore\n                        self._dataset,\n                        key=key,\n                        description=description,\n                        revises=revises,\n                        run=run,\n                    )\n                    break\n        cat_vectors = {}\n        for curator in self._slots.values():\n            for key, cat_vector in curator.cat._cat_vectors.items():\n                cat_vectors[key] = cat_vector\n        self._artifact.schema = self._schema\n        if self._external_features:\n            self._artifact._external_features = self._external_features\n        self._artifact.save()\n        return annotate_artifact(  # type: ignore\n            self._artifact,\n            curator=self,\n            cat_vectors=cat_vectors,\n        )\n\n\ndef convert_dict_to_dataframe_for_validation(d: dict, schema: Schema) -> pd.DataFrame:\n    \"\"\"Convert a dictionary to a DataFrame for validation against a schema.\"\"\"\n    df = pd.DataFrame([d])\n    for feature in schema.members:\n        # we cannot cast a `list[cat[...]]]` to categorical because lists are not hashable\n        if feature.dtype_as_str.startswith(\"cat\"):\n            if feature.name in df.columns:\n                value = df.loc[0, feature.name]\n                if isinstance(value, (list, SQLRecordList, set, BasicQuerySet)):\n                    
df.attrs[feature.name] = \"list_of_categories\"\n                else:\n                    if isinstance(value, SQLRecord) and value._state.adding:\n                        raise ValidationError(\n                            f\"{value.__class__.__name__} {getattr(value, getattr(value, 'name_field', 'name'), value.uid)} is not saved.\"\n                        )\n                    df[feature.name] = pd.Categorical(df[feature.name])\n    return df\n\n\n# For more context, read https://laminlabs.slack.com/archives/C07DB677JF6/p1753994077716099 and\n# https://www.notion.so/laminlabs/Add-a-DictCurator-2422aeaa55e180b9a513f91d13970836\nclass ComponentCurator(Curator):\n    \"\"\"Curator for `DataFrame`.\n\n    Provides all key functionality to validate Pandas DataFrames.\n    This class is not user facing unlike :class:`~lamindb.curators.DataFrameCurator` which extends this\n    class with functionality to validate the `attrs` slot.\n\n    Args:\n        dataset: The DataFrame-like object to validate & annotate.\n        schema: A :class:`~lamindb.Schema` object that defines the validation constraints.\n        slot: Indicate the slot in a composite curator for a composite data structure.\n    \"\"\"\n\n    def __init__(\n        self,\n        dataset: pd.DataFrame | Artifact,\n        schema: Schema,\n        slot: str | None = None,\n        require_saved_schema: bool = True,\n    ) -> None:\n        super().__init__(\n            dataset=dataset, schema=schema, require_saved_schema=require_saved_schema\n        )\n\n        categoricals = []\n        features = []\n        feature_ids: set[int] = set()\n\n        if schema.flexible:\n            features += Feature.filter(name__in=self._dataset.keys()).to_list()\n            feature_ids = {feature.id for feature in features}\n\n        if schema.n_members and schema.n_members > 0:\n            if schema._index_feature_uid is not None:\n                schema_features = [\n                    feature\n              
      for feature in schema.members.to_list()\n                    if feature.uid != schema._index_feature_uid  # type: ignore\n                ]\n            else:\n                schema_features = schema.members.to_list()  # type: ignore\n            if feature_ids:\n                features.extend(\n                    feature\n                    for feature in schema_features\n                    if feature.id not in feature_ids  # type: ignore\n                )\n            else:\n                features.extend(schema_features)\n        else:\n            assert schema.itype is not None  # noqa: S101\n\n        pandera_columns = {}\n        self._pandera_schema = None\n        if features or schema._index_feature_uid is not None:\n            # populate features\n            if schema.minimal_set:\n                optional_feature_uids = set(schema.optionals.get_uids())\n            for feature in features:\n                if schema.minimal_set:\n                    required = feature.uid not in optional_feature_uids\n                else:\n                    required = False\n                # series.dtype is \"object\" if the column has lists types, e.g. 
[[\"a\", \"b\"], [\"a\"], [\"b\"]]\n                dtype_str = feature._dtype_str\n                if (\n                    dtype_str.startswith(\"list[cat\")\n                    or self._dataset.attrs.get(feature.name) == \"list_of_categories\"\n                ):\n                    pandera_columns[feature.name] = pandera.Column(\n                        dtype=None,\n                        checks=pandera.Check(\n                            check_dtype(\"list\", feature.nullable),\n                            element_wise=False,\n                            error=f\"Column '{feature.name}' failed dtype check for '{dtype_str}' against (list, nullable={feature.nullable})\",\n                        ),\n                        nullable=feature.nullable,\n                        coerce=feature.coerce,\n                        required=required,\n                    )\n                elif dtype_str in {\n                    \"int\",\n                    \"float\",\n                    \"bool\",\n                    \"num\",\n                    \"path\",\n                    \"url\",\n                } or dtype_str.startswith(\"list\"):\n                    if isinstance(self._dataset, pd.DataFrame):\n                        dtype = (\n                            self._dataset[feature.name].dtype\n                            if feature.name in self._dataset.keys()\n                            else None\n                        )\n                    else:\n                        dtype = None\n                    pandera_columns[feature.name] = pandera.Column(\n                        dtype=None,\n                        checks=pandera.Check(\n                            check_dtype(dtype_str, feature.nullable),\n                            element_wise=False,\n                            error=f\"Column '{feature.name}' failed dtype check for '{dtype_str}': got {dtype}\",\n                        ),\n                        nullable=feature.nullable,\n           
             coerce=feature.coerce,\n                        required=required,\n                    )\n                elif dtype_str == \"dict\":\n                    pandera_columns[feature.name] = pandera.Column(\n                        dtype=object,\n                        nullable=feature.nullable,\n                        coerce=feature.coerce,\n                        required=required,\n                        checks=pandera.Check(\n                            lambda s: s.dropna()\n                            .apply(lambda x: isinstance(x, dict))\n                            .all(),\n                            error=\"Non-null values must be dicts\",\n                        ),\n                    )\n                else:\n                    pandera_dtype = (\n                        dtype_str if not dtype_str.startswith(\"cat\") else \"category\"\n                    )\n                    pandera_columns[feature.name] = pandera.Column(\n                        pandera_dtype,\n                        nullable=feature.nullable,\n                        coerce=feature.coerce,\n                        required=required,\n                    )\n                if dtype_str.startswith(\"cat\") or dtype_str.startswith(\"list[cat[\"):\n                    # validate categoricals if the column is required or if the column is present\n                    # but exclude the index feature from column categoricals\n                    if (required or feature.name in self._dataset.keys()) and (\n                        schema._index_feature_uid is None\n                        or feature.uid != schema._index_feature_uid\n                    ):\n                        categoricals.append(feature)\n            # in almost no case, an index should have a pandas.CategoricalDtype in a DataFrame\n            # so, we're typing it as `str` here\n            if schema.index is not None:\n                index = pandera.Index(\n                    
schema.index._dtype_str\n                    if not schema.index._dtype_str.startswith(\"cat\")\n                    else str\n                )\n            else:\n                index = None\n            if schema.maximal_set:\n                # allow any columns starting with \"__lamindb\" even if maximal_set is True\n                pandera_columns[LAMINDB_COLUMN_PREFIX_REGEX] = pandera.Column(\n                    regex=True, required=False, nullable=True\n                )\n            self._pandera_schema = pandera.DataFrameSchema(\n                pandera_columns,\n                coerce=schema.coerce,\n                strict=schema.maximal_set,\n                ordered=schema.ordered_set,\n                index=index,\n            )\n        if (\n            schema.itype == \"Composite\"\n        ):  # backward compat, should be migrated to Feature.name\n            columns_field = Feature.name\n        else:\n            columns_field = parse_cat_dtype(schema.itype, is_itype=True)[\"field\"]\n        # in the DataFrameCatManager, we use the\n        # actual columns of the dataset, not the pandera columns\n        # the pandera columns might have additional optional columns\n        self._cat_manager = DataFrameCatManager(\n            self._dataset,\n            columns_field=columns_field,\n            categoricals=categoricals,\n            index=schema.index,\n            slot=slot,\n            maximal_set=schema.maximal_set,\n            schema=schema,\n        )\n\n    @property\n    @doc_args(CAT_MANAGER_DOCSTRING)\n    def cat(self) -> DataFrameCatManager:\n        \"\"\"{}\"\"\"  # noqa: D415\n        return self._cat_manager\n\n    def standardize(self) -> None:\n        \"\"\"Standardize the dataset.\n\n        - Adds missing columns for features\n        - Fills missing values for features with default values\n        \"\"\"\n        if self._artifact is not None:\n            raise RuntimeError(\n                \"Cannot mutate the dataset 
when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator.\"\n            )\n\n        for feature in self._schema.members:\n            if feature.name not in self._dataset.columns:\n                if feature.default_value is not None or feature.nullable:\n                    fill_value = (\n                        feature.default_value\n                        if feature.default_value is not None\n                        else pd.NA\n                    )\n                    dtype_str = feature._dtype_str\n                    if dtype_str.startswith(\"cat\"):\n                        self._dataset[feature.name] = pd.Categorical(\n                            [fill_value] * len(self._dataset)\n                        )\n                    else:\n                        self._dataset[feature.name] = fill_value\n                    logger.important(\n                        f\"added column {feature.name} with fill value {fill_value}\"\n                    )\n                else:\n                    raise ValidationError(\n                        f\"Missing column {feature.name} cannot be added because is not nullable and has no default value\"\n                    )\n            else:\n                if feature.default_value is not None:\n                    if isinstance(\n                        self._dataset[feature.name].dtype, pd.CategoricalDtype\n                    ):\n                        if (\n                            feature.default_value\n                            not in self._dataset[feature.name].cat.categories\n                        ):\n                            self._dataset[feature.name] = self._dataset[\n                                feature.name\n                            ].cat.add_categories(feature.default_value)\n                    self._dataset[feature.name] = self._dataset[feature.name].fillna(\n                        feature.default_value\n                    )\n\n    
def _cat_manager_validate(self) -> None:\n        self.cat.validate()\n\n        if self.cat._is_validated:\n            self._is_validated = True\n        else:\n            self._is_validated = False\n            raise ValidationError(self.cat._validate_category_error_messages)\n\n    @doc_args(VALIDATE_DOCSTRING)\n    def validate(self) -> None:\n        \"\"\"{}\"\"\"  # noqa: D415\n        if self._pandera_schema is not None:\n            try:\n                # first validate through pandera\n                self._pandera_schema.validate(self._dataset, lazy=True)\n                # then validate lamindb categoricals\n                self._cat_manager_validate()\n            except (pandera.errors.SchemaError, pandera.errors.SchemaErrors) as err:\n                self._is_validated = False\n                has_dtype_error = \"WRONG_DATATYPE\" in str(err)\n                error_msg = str(err)\n                if has_dtype_error:\n                    error_msg += \"   ▶ Hint: Consider setting `feature.coerce = True` to attempt coercing values during validation to the required dtype.\"\n                raise ValidationError(error_msg) from err\n        else:\n            self._cat_manager_validate()\n\n\nclass DataFrameCurator(SlotsCurator):\n    # the example in the docstring is tested in test_curators_quickstart_example\n    \"\"\"Curator for `DataFrame`.\n\n    Args:\n        dataset: The DataFrame-like object to validate & annotate.\n        schema: A :class:`~lamindb.Schema` object that defines the validation constraints.\n        slot: Indicate the slot in a composite curator for a composite data structure.\n        require_saved_schema: Whether the schema must be saved before curation.\n\n    Examples:\n\n        For a simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_dataframe`.\n\n        Here is an example that enforces a minimal set of columns in the dataframe.\n\n        .. 
literalinclude:: scripts/curate_dataframe_minimal_errors.py\n            :language: python\n\n        Under-the-hood, this used the following schema.\n\n        .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py\n            :language: python\n\n        Valid features & labels were defined as:\n\n        .. literalinclude:: scripts/define_mini_immuno_features_labels.py\n            :language: python\n\n        It is also possible to curate the `attrs` slot.\n\n        .. literalinclude:: scripts/curate_dataframe_attrs.py\n            :language: python\n    \"\"\"\n\n    def __init__(\n        self,\n        dataset: pd.DataFrame | Artifact,\n        schema: Schema,\n        *,\n        slot: str | None = None,\n        features: dict[str, Any] | None = None,\n        require_saved_schema: bool = True,\n    ) -> None:\n        # loads or opens dataset, dataset may be an artifact\n        super().__init__(\n            dataset=dataset,\n            schema=schema,\n            features=features,\n            require_saved_schema=require_saved_schema,\n        )\n        # uses open dataset at self._dataset\n        self._atomic_curator = ComponentCurator(\n            dataset=self._dataset,\n            schema=schema,\n            slot=slot,\n            require_saved_schema=require_saved_schema,\n        )\n        # Handle (nested) attrs\n        if slot is None and schema.slots:\n            for slot_name, slot_schema in schema.slots.items():\n                if slot_name.startswith(\"attrs\"):\n                    path_parts = slot_name.split(\":\")\n                    attrs_dict = getattr(self._dataset, \"attrs\", None)\n                    if attrs_dict is not None:\n                        if len(path_parts) == 1:\n                            data = attrs_dict\n                        else:\n                            deeper_keys = path_parts[1:]\n                            data = _resolve_schema_slot_path(\n                                
attrs_dict, deeper_keys, slot_name, \"attrs\"\n                            )\n                        df = convert_dict_to_dataframe_for_validation(data, slot_schema)\n                        self._slots[slot_name] = ComponentCurator(\n                            df,\n                            slot_schema,\n                            slot=slot_name,\n                            require_saved_schema=require_saved_schema,\n                        )\n                elif slot_name != \"__external__\":\n                    raise ValueError(\n                        f\"Slot '{slot_name}' is not supported for DataFrameCurator. Must be 'attrs'.\"\n                    )\n\n    @property\n    def cat(self) -> DataFrameCatManager:\n        \"\"\"Manage categoricals by updating registries.\"\"\"\n        return self._atomic_curator.cat\n\n    def standardize(self) -> None:\n        \"\"\"Standardize the dataset.\n\n        - Adds missing columns for features\n        - Fills missing values for features with default values\n        \"\"\"\n        self._atomic_curator.standardize()\n        for slot_curator in self._slots.values():\n            slot_curator.standardize()\n\n    @doc_args(VALIDATE_DOCSTRING)\n    def validate(self) -> None:\n        \"\"\"{}.\"\"\"\n        self._atomic_curator.validate()\n        self._is_validated = self._atomic_curator._is_validated\n        super().validate()\n\n    @doc_args(SAVE_ARTIFACT_DOCSTRING)\n    def save_artifact(\n        self, *, key=None, description=None, revises=None, run=None\n    ) -> Artifact:\n        \"\"\"{}.\"\"\"\n        if not self._is_validated:\n            self.validate()\n        self._slots[\"columns\"] = self._atomic_curator\n        try:\n            return super().save_artifact(\n                key=key, description=description, revises=revises, run=run\n            )\n        finally:\n            del self._slots[\"columns\"]\n\n\nclass ExperimentalDictCurator(DataFrameCurator):\n    \"\"\"Curator for 
`dict` based on `DataFrameCurator`.\"\"\"\n\n    def __init__(\n        self,\n        dataset: dict | Artifact,\n        schema: Schema,\n        slot: str | None = None,\n        require_saved_schema: bool = False,\n    ) -> None:\n        if not isinstance(dataset, dict) and not isinstance(dataset, Artifact):\n            raise InvalidArgument(\"The dataset must be a dict or dict-like artifact.\")\n        if isinstance(dataset, Artifact):\n            assert dataset.otype == \"dict\", \"Artifact must be of otype 'dict'.\"  # noqa: S101\n            d = dataset.load(is_run_input=False)\n        else:\n            d = dataset\n        df = convert_dict_to_dataframe_for_validation(d, schema)  # type: ignore\n        super().__init__(\n            df, schema, slot=slot, require_saved_schema=require_saved_schema\n        )\n\n\ndef _resolve_schema_slot_path(\n    target_dict: dict[str, Any], slot_keys: Iterable[str], slot: str, base_path: str\n) -> Any:\n    \"\"\"Resolve a schema slot path by traversing nested dictionary keys.\n\n    Args:\n        target_dict: Root dictionary to traverse\n        slot_keys: Sequence of keys defining the paths to traverse\n        slot: Schema slot identifier for error context\n        base_path: Base path string for error context\n\n    Returns:\n        The value at the resolved path\n    \"\"\"\n    current = target_dict\n\n    for key in slot_keys:\n        base_path += f\"['{key}']\"\n        try:\n            current = current[key]\n        except (\n            KeyError,\n            TypeError,\n        ):  # if not a dict, raises TypeError; if a dict and key not found, raises KeyError\n            available = (\n                list(current.keys())\n                if isinstance(current, dict)\n                else \"none (not a dict)\"\n            )\n            raise InvalidArgument(\n                f\"Schema slot '{slot}' requires keys {base_path} but key '{key}' \"\n                f\"not found. 
Available keys at this level: {available}.\"\n            ) from None\n\n    return current\n\n\ndef _handle_dict_slots(\n    dataset: ScverseDataStructures, slot: str\n) -> tuple[pd.DataFrame | None, str | None, str | None]:\n    \"\"\"Handle dict-based slot paths (uns/attrs standalone or of modalities) for all ScverseCurators.\n\n    Supports two patterns:\n        - Direct dict access: \"uns\", \"attrs\", \"uns:key1:key2\", \"attrs:key\"\n        - Modality dict access: \"modality:uns\"\n\n    Args:\n        dataset: The scverse datastructure object\n        slot: The slot path string to parse like 'uns:path:to'.\n\n    Returns:\n        tuple: (dataframe, modality_key, remaining_slot_path)\n            - dataframe: Single-row DataFrame containing the resolved data\n            - modality_key: Modality identifier if slot targets modality dict, else None\n            - remaining_slot_path: The dict attribute and nested keys as string\n    \"\"\"\n    path_parts = slot.split(\":\")\n\n    # Handle direct dict slots: \"uns\", \"attrs\", \"uns:key1:key2:...\"\n    if len(path_parts) >= 1 and path_parts[0] in [\"uns\", \"attrs\"]:\n        dict_attr = getattr(dataset, path_parts[0], None)\n        if dict_attr is not None:\n            if len(path_parts) == 1:\n                return pd.DataFrame([dict_attr]), None, path_parts[0]\n\n            deeper_keys = path_parts[1:]\n            data = _resolve_schema_slot_path(\n                dict_attr, deeper_keys, slot, path_parts[0]\n            )\n            return pd.DataFrame([data]), None, \":\".join(path_parts[1:])\n\n    # Handle modality dict slots: \"modality:uns\", \"modality:uns:key1:key2\"\n    elif len(path_parts) >= 2 and path_parts[1] in [\"uns\", \"attrs\"]:\n        modality, dict_name = path_parts[0], path_parts[1]\n        try:\n            modality_dataset = dataset[modality]\n            dict_attr = getattr(modality_dataset, dict_name, None)\n            if dict_attr is not None:\n                if 
len(path_parts) == 2:\n                    return pd.DataFrame([dict_attr]), modality, dict_name\n\n                deeper_keys = path_parts[2:]\n                data = _resolve_schema_slot_path(\n                    dict_attr, deeper_keys, slot, f\"{modality}.{dict_name}\"\n                )\n                return pd.DataFrame([data]), modality, \":\".join(path_parts[1:])\n        except (KeyError, AttributeError):\n            pass\n    else:\n        raise InvalidArgument(\n            f\"Invalid dict slot pattern '{slot}'. Expected formats: \"\n            f\"'uns', 'attrs', 'uns:key', 'attrs:key', 'modality:uns'\"\n        )\n\n    return None, None, None\n\n\n@doc_args(SLOTS_DETAILS_DOCSTRING)\nclass AnnDataCurator(SlotsCurator):\n    \"\"\"Curator for `AnnData`.\n\n    {}\n\n    Args:\n        dataset: The AnnData-like object to validate & annotate.\n        schema: A :class:`~lamindb.Schema` object that defines the validation constraints.\n\n    Examples:\n\n        Curate Ensembl gene IDs and valid features in obs:\n\n        .. literalinclude:: scripts/curate_anndata_flexible.py\n            :language: python\n            :caption: curate_anndata_flexible.py\n\n        Curate `uns` dictionary:\n\n        .. 
literalinclude:: scripts/curate_anndata_uns.py\n            :language: python\n            :caption: curate_anndata_uns.py\n\n    See Also:\n        :meth:`~lamindb.Artifact.from_anndata`.\n    \"\"\"\n\n    def __init__(\n        self,\n        dataset: AnnData | Artifact,\n        schema: Schema,\n    ) -> None:\n        super().__init__(dataset=dataset, schema=schema)\n        if not data_is_scversedatastructure(self._dataset, \"AnnData\"):\n            raise InvalidArgument(\"dataset must be AnnData-like.\")\n        if schema.otype != \"AnnData\":\n            raise InvalidArgument(\"Schema otype must be 'AnnData'.\")\n\n        for slot, slot_schema in schema.slots.items():\n            if slot not in {\"var\", \"var.T\", \"obs\"} and not slot.startswith(\"uns\"):\n                raise ValueError(\n                    f\"AnnDataCurator currently only supports the slots 'var', 'var.T', 'obs', and 'uns', not {slot}\"\n                )\n            if slot.startswith(\"uns\"):\n                df, _, _ = _handle_dict_slots(self._dataset, slot)\n            elif slot in {\"obs\", \"var\", \"var.T\"}:\n                df = (\n                    getattr(self._dataset, slot.strip(\".T\")).T\n                    if slot == \"var.T\"\n                    or (\n                        slot == \"var\"\n                        and schema.slots[\"var\"].itype not in {None, \"Feature\"}\n                    )\n                    else getattr(self._dataset, slot)\n                )\n            self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)\n\n            # Handle var index naming for backward compat\n            if slot == \"var\" and schema.slots[\"var\"].itype not in {None, \"Feature\"}:\n                logger.warning(\n                    \"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}\"\n                )\n                
self._slots[\"var\"].cat._cat_vectors[\"var_index\"] = self._slots[\n                    \"var\"\n                ].cat._cat_vectors.pop(\"columns\")\n                self._slots[\"var\"].cat._cat_vectors[\"var_index\"]._key = \"var_index\"\n\n\ndef _assign_var_fields_categoricals_multimodal(\n    modality: str | None,\n    slot_type: str,\n    slot: str,\n    slot_schema: Schema,\n    var_fields: dict[str, FieldAttr],\n    cat_vectors: dict[str, dict[str, CatVector]],\n    slots: dict[str, ComponentCurator],\n) -> None:\n    \"\"\"Assigns var_fields and categoricals for multimodal data curators.\"\"\"\n    if modality is not None:\n        var_fields[modality] = None\n        cat_vectors[modality] = {}\n\n    if slot_type == \"var\":\n        var_field = parse_cat_dtype(slot_schema.itype, is_itype=True)[\"field\"]\n        if modality is None:\n            # This should rarely/never be used since tables should have different var fields\n            var_fields[slot] = var_field  # pragma: no cover\n        else:\n            # Note that this is NOT nested since the nested key is always \"var\"\n            var_fields[modality] = var_field\n    else:\n        obs_fields = slots[slot].cat._cat_vectors\n        if modality is None:\n            cat_vectors[slot] = obs_fields\n        else:\n            # Note that this is NOT nested since the nested key is always \"obs\"\n            cat_vectors[modality] = obs_fields\n\n\n@doc_args(SLOTS_DETAILS_DOCSTRING)\nclass MuDataCurator(SlotsCurator):\n    \"\"\"Curator for `MuData`.\n\n    {}\n\n    Args:\n        dataset: The MuData-like object to validate & annotate.\n        schema: A :class:`~lamindb.Schema` object that defines the validation constraints.\n\n    Example:\n        .. 
literalinclude:: scripts/curate_mudata.py\n            :language: python\n            :caption: curate_mudata.py\n\n    See Also:\n        :meth:`~lamindb.Artifact.from_mudata`.\n    \"\"\"\n\n    def __init__(\n        self,\n        dataset: MuData | Artifact,\n        schema: Schema,\n    ) -> None:\n        super().__init__(dataset=dataset, schema=schema)\n        if not data_is_scversedatastructure(self._dataset, \"MuData\"):\n            raise InvalidArgument(\"dataset must be MuData-like.\")\n        if schema.otype != \"MuData\":\n            raise InvalidArgument(\"Schema otype must be 'MuData'.\")\n\n        for slot, slot_schema in schema.slots.items():\n            # Handle slots: \"mdata.uns\", \"modality:uns\"\n            if \"uns\" in slot:\n                df, modality, modality_slot = _handle_dict_slots(self._dataset, slot)\n            else:\n                # Handle slots: \"modality:obs\", \"modality:var\"\n                parts = slot.split(\":\")\n                if len(parts) == 2:\n                    modality, modality_slot = parts\n                    try:\n                        schema_dataset = self._dataset[modality]\n                        df = getattr(schema_dataset, modality_slot.rstrip(\".T\"))\n                    except KeyError:\n                        raise InvalidArgument(\n                            f\"Modality '{modality}' not found in MuData\"\n                        ) from None\n                    except AttributeError:\n                        raise InvalidArgument(\n                            f\"Attribute '{modality_slot}' not found on modality '{modality}'\"\n                        ) from None\n                else:\n                    # Handle slots: \"mdata:obs\", \"mdata:var\" (uns is a dictionary and gets handled above)\n                    modality, modality_slot = None, slot\n                    schema_dataset = self._dataset\n                    df = getattr(schema_dataset, 
modality_slot.rstrip(\".T\"))\n\n            # Transpose var if necessary\n            if modality_slot == \"var\" and schema.slots[slot].itype not in {\n                None,\n                \"Feature\",\n            }:\n                logger.warning(\n                    \"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}\"\n                )\n                df = df.T\n            elif modality_slot == \"var.T\":\n                df = df.T\n\n            self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)\n\n            _assign_var_fields_categoricals_multimodal(\n                modality=modality,\n                slot_type=modality_slot,\n                slot=slot,\n                slot_schema=slot_schema,\n                var_fields=self._var_fields,\n                cat_vectors=self._cat_vectors,\n                slots=self._slots,\n            )\n\n        self._columns_field = self._var_fields\n\n\n@doc_args(SLOTS_DETAILS_DOCSTRING)\nclass SpatialDataCurator(SlotsCurator):\n    \"\"\"Curator for `SpatialData`.\n\n    {}\n\n    Args:\n        dataset: The SpatialData-like object to validate & annotate.\n        schema: A :class:`~lamindb.Schema` object that defines the validation constraints.\n\n    Example:\n        .. 
literalinclude:: scripts/curate_spatialdata.py\n            :language: python\n            :caption: curate_spatialdata.py\n\n    See Also:\n        :meth:`~lamindb.Artifact.from_spatialdata`.\n    \"\"\"\n\n    def __init__(\n        self,\n        dataset: SpatialData | Artifact,\n        schema: Schema,\n    ) -> None:\n        super().__init__(dataset=dataset, schema=schema)\n        if not data_is_scversedatastructure(self._dataset, \"SpatialData\"):\n            raise InvalidArgument(\"dataset must be SpatialData-like.\")\n        if schema.otype != \"SpatialData\":\n            raise InvalidArgument(\"Schema otype must be 'SpatialData'.\")\n\n        for slot, slot_schema in schema.slots.items():\n            # Handle slots: \"sdata:attrs\"\n            if slot.startswith(\"attrs\"):\n                df, table_key, table_slot = _handle_dict_slots(self._dataset, slot)\n            else:\n                parts = slot.split(\":\")\n                # Handle slots: \"tables:table_key:obs\", \"tables:table_key:var\"\n                if len(parts) == 3 and parts[0] == \"tables\":\n                    table_key, table_slot = parts[1], parts[2]\n                    try:\n                        slot_object = self._dataset.tables[table_key]\n                        df = getattr(slot_object, table_slot.rstrip(\".T\"))\n                    except KeyError:\n                        raise InvalidArgument(\n                            f\"Table '{table_key}' not found in sdata.tables\"\n                        ) from None\n                    except AttributeError:\n                        raise InvalidArgument(\n                            f\"Attribute '{table_slot}' not found on table '{table_key}'\"\n                        ) from None\n                else:\n                    # Handle legacy single keys for backward compatibility\n                    if len(parts) == 1 and parts[0] != \"attrs\":\n                        logger.warning(\n                            
f\"please prefix slot {slot} with 'attrs:' going forward\"\n                        )\n                        try:\n                            df = pd.DataFrame([self._dataset.attrs[slot]])\n                            table_key = None\n                            table_slot = slot\n                        except KeyError:\n                            raise InvalidArgument(\n                                f\"Slot '{slot}' not found in sdata.attrs\"\n                            ) from None\n                    else:\n                        raise InvalidArgument(f\"Unrecognized slot format: {slot}\")\n\n            # Handle var transposition logic\n            if table_slot == \"var\" and schema.slots[slot].itype not in {\n                None,\n                \"Feature\",\n            }:\n                logger.warning(\n                    \"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}\"\n                )\n                df = df.T\n            elif table_slot == \"var.T\":\n                df = df.T\n\n            self._slots[slot] = ComponentCurator(df, slot_schema, slot)\n\n            _assign_var_fields_categoricals_multimodal(\n                modality=table_key,\n                slot_type=table_slot,\n                slot=slot,\n                slot_schema=slot_schema,\n                var_fields=self._var_fields,\n                cat_vectors=self._cat_vectors,\n                slots=self._slots,\n            )\n\n        self._columns_field = self._var_fields\n\n\n@doc_args(SLOTS_DETAILS_DOCSTRING)\nclass TiledbsomaExperimentCurator(SlotsCurator):\n    \"\"\"Curator for `tiledbsoma.Experiment`.\n\n    {}\n\n    Args:\n        dataset: The `tiledbsoma.Experiment` object.\n        schema: A :class:`~lamindb.Schema` object that defines the validation constraints.\n\n    Example:\n\n        .. 
literalinclude:: scripts/curate_soma_experiment.py\n            :language: python\n            :caption: curate_soma_experiment.py\n\n    See Also:\n        :meth:`~lamindb.Artifact.from_tiledbsoma`.\n    \"\"\"\n\n    def __init__(\n        self,\n        dataset: SOMAExperiment | Artifact,\n        schema: Schema,\n    ) -> None:\n        super().__init__(dataset=dataset, schema=schema)\n        if not data_is_soma_experiment(self._dataset):\n            raise InvalidArgument(\"dataset must be SOMAExperiment-like.\")\n        if schema.otype != \"tiledbsoma\":\n            raise InvalidArgument(\"Schema otype must be 'tiledbsoma'.\")\n\n        for slot, slot_schema in schema.slots.items():\n            if slot.startswith(\"ms:\"):\n                _, modality_slot = slot.split(\":\")\n                schema_dataset = (\n                    self._dataset.ms[modality_slot.removesuffix(\".T\")]\n                    .var.read()\n                    .concat()\n                    .to_pandas()\n                    .drop(\"soma_joinid\", axis=1, errors=\"ignore\")\n                )\n\n                self._slots[slot] = ComponentCurator(\n                    (schema_dataset.T if modality_slot == \"var.T\" else schema_dataset),\n                    slot_schema,\n                )\n            else:\n                # global Experiment obs slot\n                modality_slot = slot\n                schema_dataset = (\n                    self._dataset.obs.read()\n                    .concat()\n                    .to_pandas()\n                    .drop([\"soma_joinid\", \"obs_id\"], axis=1, errors=\"ignore\")\n                )\n                self._slots[slot] = ComponentCurator(\n                    schema_dataset,\n                    slot_schema,\n                )\n\n            _assign_var_fields_categoricals_multimodal(\n                modality=slot,  # not passing `measurement` here because it's a constant. 
The slot has the actual modality\n                slot_type=modality_slot,\n                slot=slot,\n                slot_schema=slot_schema,\n                var_fields=self._var_fields,\n                cat_vectors=self._cat_vectors,\n                slots=self._slots,\n            )\n        self._columns_field = self._var_fields\n\n\nclass CatVector:\n    \"\"\"Vector with categorical values.\"\"\"\n\n    def __init__(\n        self,\n        values_getter: Callable\n        | Iterable[str],  # A callable or iterable that returns the values to validate.\n        field: FieldAttr,  # The field to validate against.\n        key: str,  # The name of the vector to validate. Only used for logging.\n        values_setter: Callable | None = None,  # A callable that sets the values.\n        source: SQLRecord | None = None,  # The ontology source to validate against.\n        feature: Feature | None = None,\n        cat_manager: DataFrameCatManager | None = None,\n        filter_str: str = \"\",\n        record_uid: str | None = None,\n        maximal_set: bool = True,  # whether unvalidated categoricals cause validation failure.\n        schema: Schema = None,\n    ) -> None:\n        self._values_getter = values_getter\n        self._values_setter = values_setter\n        self._field = field\n        self._key = key\n        self._source = source\n        self._validated: None | list[str] = None\n        self._non_validated: None | list[str] = None\n        self._synonyms: None | dict[str, str] = None\n        self._record_uid = record_uid\n        self._subtype_query_set = None\n        self._cat_manager = cat_manager\n        self.feature = feature\n        self.records = None\n        self._maximal_set = maximal_set\n        self._type_record = None\n        self._registry = self._field.field.model\n        self._field_name = self._field.field.name\n        self._filter_kwargs = {}\n        self._schema = schema\n        if filter_str and filter_str != 
\"unsaved\":\n            self._filter_kwargs.update(\n                resolve_relation_filters(\n                    parse_filter_string(filter_str), self._registry\n                )  # type: ignore\n            )\n        if self._registry.__base__.__name__ == \"BioRecord\":\n            if self._source is not None:\n                self._filter_kwargs[\"source\"] = self._source\n            organism_record = get_organism_record_from_field(\n                field=self._field,\n                organism=self._filter_kwargs.get(\"organism\"),\n                values=self.values,\n            )\n            if organism_record is not None:\n                self._filter_kwargs[\"organism\"] = organism_record\n        self._filter_kwargs = get_current_filter_kwargs(\n            self._registry, self._filter_kwargs\n        )\n\n        # get the dtype associated record based on the record_uid\n        if self._record_uid:\n            self._type_record = get_record_type_from_uid(\n                self._registry,\n                self._record_uid,\n            )\n\n        if hasattr(self._registry, \"_name_field\"):\n            label_ref_is_name = self._field_name == self._registry._name_field\n        else:\n            label_ref_is_name = self._field_name == \"name\"\n        self.label_ref_is_name = label_ref_is_name\n\n    @property\n    def values(self):\n        \"\"\"Get the current values using the getter function.\"\"\"\n        if callable(self._values_getter):\n            return self._values_getter()\n        return self._values_getter\n\n    @values.setter\n    def values(self, new_values):\n        \"\"\"Set new values using the setter function if available.\"\"\"\n        if callable(self._values_setter):\n            self._values_setter(new_values)\n        else:\n            # If values_getter is not callable, it's a direct reference we can update\n            self._values_getter = new_values\n\n    @property\n    def is_validated(self) -> bool:\n     
   \"\"\"Whether the vector is validated.\"\"\"\n        # if nothing was validated, something likely is fundamentally wrong\n        # should probably add a setting `at_least_one_validated`\n        result = True\n        if len(self.values) > 0 and len(self.values) == len(self._non_validated):\n            logger.warning(f\"no values were validated for {self._key}!\")\n        # len(self._non_validated) != 0\n        #     if maximal_set is True, return False\n        #     if maximal_set is False, return True\n        # len(self._non_validated) == 0\n        #     return True\n        if len(self._non_validated) != 0:\n            if self._maximal_set:\n                result = False\n        return result\n\n    def _replace_synonyms(self) -> list[str]:\n        \"\"\"Replace synonyms in the vector with standardized values.\"\"\"\n\n        def process_value(value, syn_mapper):\n            \"\"\"Helper function to process values recursively.\"\"\"\n            if isinstance(value, list):\n                # Handle list - recursively process each item\n                return [process_value(item, syn_mapper) for item in value]\n            else:\n                # Handle single value\n                return syn_mapper.get(value, value)\n\n        syn_mapper = self._synonyms\n        # replace the values in df\n        std_values = self.values.map(\n            lambda unstd_val: process_value(unstd_val, syn_mapper)\n        )\n        # remove the standardized values from self.non_validated\n        non_validated = [i for i in self._non_validated if i not in syn_mapper]\n        if len(non_validated) == 0:\n            self._non_validated = []\n        else:\n            self._non_validated = non_validated  # type: ignore\n        # logging\n        n = len(syn_mapper)\n        if n > 0:\n            syn_mapper_print = _format_values(\n                [f'\"{k}\" → \"{v}\"' for k, v in syn_mapper.items()], sep=\"\"\n            )\n            s = \"s\" if n > 1 
else \"\"\n            logger.success(\n                f'standardized {n} synonym{s} in \"{self._key}\": {colors.green(syn_mapper_print)}'\n            )\n        return std_values\n\n    def __repr__(self) -> str:\n        if self._non_validated is None:\n            status = \"unvalidated\"\n        else:\n            status = (\n                \"validated\"\n                if len(self._non_validated) == 0\n                else f\"non-validated ({len(self._non_validated)})\"\n            )\n\n        field_name = getattr(self._field, \"name\", str(self._field))\n        values_count = len(self.values) if hasattr(self.values, \"__len__\") else \"?\"\n        return f\"CatVector(key='{self._key}', field='{field_name}', values={values_count}, {status})\"\n\n    def _add_validated(self) -> tuple[list, list]:\n        \"\"\"Save features or labels records in the default instance.\"\"\"\n        from lamindb.models.has_parents import keep_topmost_matches\n        from lamindb.models.save import save as ln_save\n\n        model_field = self._registry.__get_name_with_module__()\n\n        values = [\n            value\n            for value in self.values\n            if (isinstance(value, str) and value)\n            or (\n                isinstance(value, (int, float))\n                and not isinstance(value, bool)\n                and value == value\n            )\n            or (isinstance(value, list) and value)\n            or (\n                isinstance(value, np.ndarray) and value.size > 0 and value.dtype != bool\n            )\n        ]\n        if not values:\n            return [], []\n\n        # if a value is a list, we need to flatten it\n        str_values = _flatten_unique(values)\n\n        # if values are SQLRecord, we don't need to validate them\n        if all(isinstance(v, SQLRecord) for v in str_values):\n            assert all(v._state.adding is False for v in str_values), (\n                \"All records must be saved.\"\n            )\n  
          self.records = str_values  # type: ignore\n            validated_values = str_values  # type: ignore\n            return validated_values, []\n\n        # get all field specs for union types\n        if self.feature:\n            results = parse_dtype(self.feature._dtype_str)\n        else:\n            results = [None]\n\n        all_validated = []\n        all_records = []\n        remaining_values = str_values\n\n        for result in results:\n            if not remaining_values:\n                break  # pragma: no cover\n\n            if result is not None:\n                field = result[\"field\"]\n                registry = field.field.model\n                field_name = field.field.name\n                filter_kwargs: dict[str, str | SQLRecord] = {}\n                filter_str = result.get(\"filter_str\", \"\")\n                if filter_str:\n                    parsed_filters = parse_filter_string(filter_str)\n                    filter_kwargs.update(\n                        resolve_relation_filters(parsed_filters, registry)\n                    )\n                if registry.__base__.__name__ == \"BioRecord\":\n                    organism_record = get_organism_record_from_field(\n                        field=field,\n                        organism=None,\n                        values=remaining_values,\n                    )\n                    if organism_record is not None:\n                        filter_kwargs[\"organism\"] = organism_record\n                # Merge in self._filter_kwargs (contains cat_filters from Feature)\n                if self._filter_kwargs:\n                    filter_kwargs.update(self._filter_kwargs)\n                filter_kwargs = get_current_filter_kwargs(registry, filter_kwargs)\n            else:\n                field = self._field\n                registry = self._registry\n                field_name = self._field_name\n                filter_kwargs = self._filter_kwargs\n\n            # inspect the 
default instance and save validated records from public\n            if issubclass(registry, HasType):\n                if self._type_record is None:\n                    # When we have a Schema with typed members,\n                    # scope the query to the types present in the schema's members (plus untyped features)\n                    # to avoid ambiguous matches across different feature types.\n                    qs = registry.filter()\n                    if self._schema and self._schema.n_members:\n                        type_ids = {\n                            m.type_id\n                            for m in self._schema.members\n                            if m.type_id is not None\n                        }\n                        if type_ids:\n                            qs = registry.filter(\n                                Q(type_id__in=type_ids) | Q(type_id__isnull=True)\n                            )\n                    self._subtype_query_set = qs\n                else:\n                    query_sub_types = getattr(\n                        self._type_record, f\"query_{registry.__name__.lower()}s\"\n                    )\n                    self._subtype_query_set = query_sub_types()\n                subtype_query_set = (\n                    self._subtype_query_set.filter(**filter_kwargs)\n                    if filter_kwargs\n                    else self._subtype_query_set\n                )\n                values_array = np.array(remaining_values)\n                validated_mask = subtype_query_set.validate(\n                    values_array, field=field, mute=True\n                )\n                validated_values, non_validated_values = (\n                    list(set(values_array[validated_mask])),\n                    list(set(values_array[~validated_mask])),\n                )\n                records = subtype_query_set.filter(\n                    **{f\"{field_name}__in\": validated_values}\n                ).to_list()\n        
        records = keep_topmost_matches(records)\n            else:\n                existing_and_public_records = _from_values(\n                    remaining_values,\n                    field=field,\n                    mute=True,\n                    **filter_kwargs,  # type: ignore\n                )\n                existing_and_public_values = [\n                    getattr(r, field_name) for r in existing_and_public_records\n                ]\n                # public records that are not already in the database\n                public_records = [\n                    r for r in existing_and_public_records if r._state.adding\n                ]\n                if len(public_records) > 0:\n                    logger.info(f\"saving validated records of '{self._key}'\")\n                    ln_save(public_records)\n                    values_saved_public = [\n                        getattr(r, field_name) for r in public_records\n                    ]\n                    # log the saved public labels\n                    # the term \"transferred\" stresses that this is always in the context of transferring\n                    # labels from a public ontology or a different instance to the present instance\n                    if len(values_saved_public) > 0:\n                        s = \"s\" if len(values_saved_public) > 1 else \"\"\n                        logger.success(\n                            f'added {len(values_saved_public)} record{s} {colors.green(\"from_public\")} with {model_field} for \"{self._key}\": {_format_values(values_saved_public)}'\n                        )\n                        # non-validated records from the default instance\n                non_validated_values = [\n                    i for i in remaining_values if i not in existing_and_public_values\n                ]\n                validated_values = existing_and_public_values\n                records = existing_and_public_records\n\n            
all_validated.extend(validated_values)\n            all_records.extend(records)\n            remaining_values = non_validated_values\n\n        self.records = all_records\n        # validated values, non-validated values\n        return all_validated, remaining_values\n\n    def _add_new(\n        self,\n        values: list[str],\n        df: pd.DataFrame | None = None,  # remove when all users use schema\n        dtype: str | None = None,\n        **create_kwargs,\n    ) -> None:\n        \"\"\"Add new labels to the registry.\"\"\"\n        from lamindb.models.save import save as ln_save\n\n        non_validated_records: SQLRecordList[Any] = []  # type: ignore\n        if df is not None and self._registry == Feature:\n            nonval_columns = Feature.inspect(df.columns, mute=True).non_validated\n            non_validated_records = Feature.from_dataframe(df.loc[:, nonval_columns])\n        else:\n            organism_record = self._filter_kwargs.get(\"organism\", None)\n            for value in values:\n                init_kwargs = {self._field_name: value}\n                if self._registry == Feature:\n                    init_kwargs[\"dtype\"] = \"cat\" if dtype is None else dtype\n                if self._type_record is not None:\n                    # if type_record is set, we need to set the type for new records\n                    init_kwargs[\"type\"] = self._type_record\n                if organism_record is not None:\n                    init_kwargs[\"organism\"] = organism_record\n                # here we create non-validated records skipping validation since we already ensured that they don't exist\n                non_validated_records.append(\n                    self._registry(\n                        **init_kwargs, **create_kwargs, _skip_validation=True\n                    )\n                )\n        if len(non_validated_records) > 0:\n            ln_save(non_validated_records)\n            model_field = 
colors.italic(self._registry.__get_name_with_module__())\n            s = \"s\" if len(values) > 1 else \"\"\n            logger.success(\n                f'added {len(values)} record{s} with {model_field} for \"{self._key}\": {_format_values(values)}'\n            )\n\n    def _validate(\n        self,\n        values: list[str],\n    ) -> tuple[list[str], dict]:\n        \"\"\"Validate ontology terms using LaminDB registries.\"\"\"\n        model_field = f\"{self._registry.__name__}.{self._field_name}\"\n\n        # get all field specs for union types\n        if self.feature:\n            results = parse_dtype(self.feature._dtype_str)\n        else:\n            results = [{\"field\": self._field}]\n\n        non_validated = values\n        syn_mapper: dict[str, str] = {}\n\n        for result in results:\n            if not non_validated:\n                break\n            field = result[\"field\"]\n            registry = field.field.model\n            filter_kwargs = self._filter_kwargs.copy()\n            filter_str = result.get(\"filter_str\", \"\")\n            if filter_str:\n                parsed_filters = parse_filter_string(filter_str)\n                filter_kwargs.update(resolve_relation_filters(parsed_filters, registry))\n            registry_or_queryset = registry\n            if self._subtype_query_set is not None and registry == self._registry:\n                registry_or_queryset = self._subtype_query_set\n            # first inspect against the registry\n            inspect_result = registry_or_queryset.filter(**filter_kwargs).inspect(\n                non_validated,\n                field=field,\n                mute=True,\n                from_source=False,\n            )\n            # here non_validated includes synonyms and new values\n            non_validated = inspect_result.non_validated\n            syn_mapper.update(inspect_result.synonyms_mapper)\n\n        # logging messages\n        if self._cat_manager is not None:\n            
slot = self._cat_manager._slot\n        else:\n            slot = None\n        in_slot = f\" in slot '{slot}'\" if slot is not None else \"\"\n        slot_prefix = f\".slots['{slot}']\" if slot is not None else \"\"\n        non_validated_hint_print = (\n            f\"curator{slot_prefix}.cat.add_new_from('{self._key}')\"\n        )\n        n_non_validated = len(non_validated)\n        if n_non_validated == 0:\n            logger.success(\n                f'\"{self._key}\" is validated against {colors.italic(model_field)}'\n            )\n            return [], {}\n        else:\n            s = \"\" if n_non_validated == 1 else \"s\"\n            print_values = _format_values(non_validated)\n            warning_message = f\"{colors.red(f'{n_non_validated} term{s}')} not validated in feature '{self._key}'{in_slot}: {colors.red(print_values)}\\n\"\n            # log synonyms if any\n            if syn_mapper:\n                s = \"\" if len(syn_mapper) == 1 else \"s\"\n                syn_mapper_print = _format_values(\n                    [f'\"{k}\" → \"{v}\"' for k, v in syn_mapper.items()], sep=\"\"\n                )\n                hint_msg = f'.standardize(\"{self._key}\")'\n                warning_message += f\"    {colors.yellow(f'{len(syn_mapper)} synonym{s}')} found: {colors.yellow(syn_mapper_print)}\\n    → curate synonyms via: {colors.cyan(hint_msg)}\"\n            if n_non_validated > len(syn_mapper):\n                if syn_mapper:\n                    warning_message += \"\\n    for remaining terms:\\n\"\n                check_organism = \"\"\n                if (\n                    self._registry.__base__.__name__ == \"BioRecord\"\n                    and self._registry.require_organism(field=self._field)\n                ):\n                    organism = self._filter_kwargs.get(\"organism\", None)\n                    check_organism = f\"fix organism '{organism}', \"\n                warning_message += f\"    → {check_organism}fix typos, 
remove non-existent values, or save terms via: {colors.cyan(non_validated_hint_print)}\"\n                if self._subtype_query_set is not None and self._type_record:\n                    warning_message += f\"\\n    → a valid label for subtype '{self._type_record.name}' has to be one of {self._subtype_query_set.to_list('name')}\"\n            logger.info(f'mapping \"{self._key}\" on {colors.italic(model_field)}')\n            logger.warning(warning_message)\n            if self._cat_manager is not None:\n                self._cat_manager._validate_category_error_messages = strip_ansi_codes(\n                    warning_message\n                )\n            return non_validated, syn_mapper\n\n    def validate(self) -> None:\n        \"\"\"Validate the vector.\"\"\"\n        # add source-validated values to the registry\n        self._validated, self._non_validated = self._add_validated()\n        self._non_validated, self._synonyms = self._validate(values=self._non_validated)\n\n    def standardize(self) -> None:\n        \"\"\"Standardize the vector.\"\"\"\n        if not hasattr(self._registry, \"standardize\"):\n            return self.values\n        if self._synonyms is None:\n            self.validate()\n        # get standardized values\n        std_values = self._replace_synonyms()\n        # update non_validated values\n        self._non_validated = [\n            i for i in self._non_validated if i not in self._synonyms.keys()\n        ]\n        # remove synonyms since they are now standardized\n        self._synonyms = {}\n        # update the values with the standardized values\n        self.values = std_values\n\n    def add_new(self, **create_kwargs) -> None:\n        \"\"\"Add new values to the registry.\"\"\"\n        if self._non_validated is None:\n            self.validate()\n        if len(self._synonyms) > 0:\n            # raise error because .standardize modifies the input dataset\n            raise ValidationError(\n                
\"Please run `.standardize()` before adding new values.\"\n            )\n        self._add_new(\n            values=self._non_validated,\n            **create_kwargs,\n        )\n        # remove the non_validated values since they are now registered\n        self._non_validated = []\n\n\nclass DataFrameCatManager:\n    \"\"\"Manage categoricals by updating registries.\n\n    This class is accessible from within a `DataFrameCurator` via the `.cat` attribute.\n\n    If you find non-validated values, you have two options:\n\n    - new values found in the data can be registered via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.core.DataFrameCatManager.add_new_from`\n    - non-validated values can be accessed via `DataFrameCurator.cat.non_validated` :attr:`~lamindb.curators.core.DataFrameCatManager.non_validated` and addressed manually\n    \"\"\"\n\n    def __init__(\n        self,\n        df: pd.DataFrame | Artifact,\n        columns_field: FieldAttr = Feature.name,\n        categoricals: list[Feature] | None = None,\n        sources: dict[str, SQLRecord] | None = None,\n        index: Feature | None = None,\n        slot: str | None = None,\n        maximal_set: bool = False,\n        schema: Schema | None = None,\n    ) -> None:\n        self._non_validated = None\n        self._index = index\n        self._artifact: Artifact = None  # pass the dataset as an artifact\n        self._dataset: Any = df  # pass the dataset as an AnyPathStr or data object\n        if isinstance(self._dataset, Artifact):\n            self._artifact = self._dataset\n            self._dataset = self._dataset.load(is_run_input=False)\n        self._is_validated: bool = False\n        self._categoricals = categoricals or []\n        self._non_validated = None\n        self._sources = sources or {}\n        self._columns_field = columns_field\n        self._validate_category_error_messages: str = \"\"\n        self._cat_vectors: dict[str, CatVector] = {}\n        
self._slot = slot\n        self._maximal_set = maximal_set\n        columns = self._dataset.keys()\n        if maximal_set:\n            columns = [\n                col for col in columns if not re.match(LAMINDB_COLUMN_PREFIX_REGEX, col)\n            ]\n        self._cat_vectors[\"columns\"] = CatVector(\n            values_getter=lambda: columns,  # lambda ensures the inplace update\n            values_setter=lambda new_values: setattr(\n                self._dataset, \"columns\", pd.Index(new_values)\n            )\n            if isinstance(self._dataset, pd.DataFrame)\n            else None,\n            field=columns_field,\n            key=\"columns\" if isinstance(self._dataset, pd.DataFrame) else \"keys\",\n            source=self._sources.get(\"columns\"),\n            cat_manager=self,\n            maximal_set=self._maximal_set,\n            filter_str=\"\"\n            if schema.flexible\n            else \"unsaved\"\n            if schema.id is None\n            else f\"schemas__id={schema.id}\",\n            schema=schema,\n        )\n        for feature in self._categoricals:\n            result = parse_dtype(feature._dtype_str)[0]\n            key = feature.name\n            # only create CatVector if the key exists in the DataFrame\n            if key in self._dataset.columns:\n                self._cat_vectors[key] = CatVector(\n                    values_getter=lambda k=key: self._dataset[\n                        k\n                    ],  # Capture key as default argument\n                    values_setter=lambda new_values, k=key: self._dataset.__setitem__(\n                        k, new_values\n                    ),\n                    field=result[\"field\"],\n                    key=key,\n                    source=self._sources.get(key),\n                    feature=feature,\n                    cat_manager=self,\n                    filter_str=result[\"filter_str\"],\n                    record_uid=result.get(\"record_uid\"),\n         
       )\n        if index is not None and index._dtype_str.startswith(\"cat\"):\n            result = parse_dtype(index._dtype_str)[0]\n            key = \"index\"\n            self._cat_vectors[key] = CatVector(\n                values_getter=self._dataset.index,\n                values_setter=lambda new_values: setattr(\n                    self._dataset, \"index\", new_values\n                ),\n                field=result[\"field\"],\n                key=key,\n                feature=index,\n                cat_manager=self,\n                filter_str=result[\"filter_str\"],\n                record_uid=result.get(\"record_uid\"),\n            )\n\n    @property\n    def non_validated(self) -> dict[str, list[str]]:\n        \"\"\"Return the non-validated features and labels.\"\"\"\n        if self._non_validated is None:\n            raise ValidationError(\"Please run validate() first!\")\n        return {\n            key: cat_vector._non_validated\n            for key, cat_vector in self._cat_vectors.items()\n            if cat_vector._non_validated and key != \"columns\"\n        }\n\n    @property\n    def categoricals(self) -> list[Feature]:\n        \"\"\"The categorical features.\"\"\"\n        return self._categoricals\n\n    def __repr__(self) -> str:\n        cls_name = colors.green(self.__class__.__name__)\n\n        status_str = (\n            f\"{colors.green('validated')}\"\n            if self._is_validated\n            else f\"{colors.yellow('unvalidated')}\"\n        )\n\n        info_parts = []\n\n        cat_count = len(self._categoricals)\n        if cat_count > 0:\n            info_parts.append(f\"categorical_features={cat_count}\")\n\n        if self._slot:\n            info_parts.append(f\"slot: {colors.italic(self._slot)}\")\n\n        info_str = \", \".join(info_parts)\n        if info_str:\n            return f\"{cls_name}({info_str}, {status_str})\"\n        else:\n            return f\"{cls_name}({status_str})\"\n\n    def 
lookup(self, public: bool = False) -> CatLookup:\n        \"\"\"Lookup categories.\n\n        Args:\n            public: If `True`, the lookup is performed on the public reference.\n        \"\"\"\n        return CatLookup(\n            categoricals=self._categoricals,\n            slots={\"columns\": self._columns_field},\n            public=public,\n            sources=self._sources,\n        )\n\n    def validate(self) -> bool:\n        \"\"\"Validate variables and categorical observations.\"\"\"\n        self._validate_category_error_messages = \"\"  # reset the error messages\n        validated = True\n        for key, cat_vector in self._cat_vectors.items():\n            logger.info(f\"validating vector {key}\")\n            cat_vector.validate()\n            validated &= cat_vector.is_validated\n        self._is_validated = validated\n        self._non_validated = {}  # type: ignore\n\n        if self._index is not None:\n            # cat_vector.validate() populates validated labels\n            # the index should become part of the feature set corresponding to the dataframe\n            if self._cat_vectors[\"columns\"].records is not None:\n                self._cat_vectors[\"columns\"].records.insert(0, self._index)  # type: ignore\n            else:\n                self._cat_vectors[\"columns\"].records = [self._index]  # type: ignore\n\n        return self._is_validated\n\n    def standardize(self, key: str) -> None:\n        \"\"\"Replace synonyms with standardized values.\n\n        Modifies the input dataset inplace.\n\n        Args:\n            key: The key referencing the column in the DataFrame to standardize.\n        \"\"\"\n        if self._artifact is not None:\n            raise RuntimeError(\n                \"Cannot mutate the dataset when an artifact is passed! 
Please load the dataset into memory using `dataset.load()` and pass it to a curator.\"\n            )\n\n        if key == \"all\":\n            logger.warning(\n                \"'all' is deprecated, please pass a single key from `.non_validated.keys()` instead!\"\n            )\n            for k in self.non_validated.keys():\n                self._cat_vectors[k].standardize()\n        else:\n            self._cat_vectors[key].standardize()\n\n    def add_new_from(self, key: str, **kwargs):\n        \"\"\"Add validated & new categories.\n\n        Args:\n            key: The key referencing the slot in the DataFrame from which to draw terms.\n            **kwargs: Additional keyword arguments to pass to create new records\n        \"\"\"\n        if len(kwargs) > 0 and key == \"all\":\n            raise ValueError(\"Cannot pass additional arguments to 'all' key!\")\n        if key == \"all\":\n            logger.warning(\n                \"'all' is deprecated, please pass a single key from `.non_validated.keys()` instead!\"\n            )\n            for k in self.non_validated.keys():\n                self._cat_vectors[k].add_new(**kwargs)\n        else:\n            self._cat_vectors[key].add_new(**kwargs)\n\n\ndef get_current_filter_kwargs(\n    registry: type[SQLRecord], kwargs: dict[str, str | SQLRecord]\n) -> dict:\n    \"\"\"Make sure the source and organism are saved in the same database as the registry.\"\"\"\n    db = registry.filter().db\n    filter_kwargs = kwargs.copy()\n\n    for key, value in kwargs.items():\n        if isinstance(value, SQLRecord) and value._state.db != \"default\":\n            if db is None or db == \"default\":\n                value_default = copy.copy(value)\n                value_default.save()\n                filter_kwargs[key] = value_default\n\n    return filter_kwargs\n\n\ndef annotate_artifact(\n    artifact: Artifact,\n    *,\n    curator: SlotsCurator | None = None,\n    cat_vectors: dict[str, CatVector] | None = 
None,\n) -> Artifact:\n    from .. import settings\n    from ..models.artifact import add_labels\n    from ..models.schema import ArtifactSchema\n\n    if cat_vectors is None:\n        cat_vectors = {}\n\n    # annotate with labels\n    for key, cat_vector in cat_vectors.items():\n        if (\n            cat_vector._registry == Feature\n            or key == \"columns\"\n            or key == \"var_index\"\n            or cat_vector.records is None\n        ):\n            continue\n        if len(cat_vector.records) > settings.annotation.n_max_records:\n            logger.important(\n                f\"not annotating with {len(cat_vector.records)} labels for feature {key} as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)\"\n            )\n            continue\n        add_labels(\n            artifact,\n            records=cat_vector.records,\n            feature=cat_vector.feature,\n            from_curator=True,\n        )\n\n    # annotate with inferred schemas aka feature sets\n    if (\n        artifact.otype == \"DataFrame\" and getattr(curator, \"_schema\", None) is None\n    ):  # Prevent overwriting user-defined schemas that contain slots\n        features = cat_vectors[\"columns\"].records\n        if features is not None:\n            index_feature = artifact.schema.index\n            index_feature_id = None if index_feature is None else index_feature.id\n            feature_set = Schema(\n                features=[\n                    f\n                    for f in features\n                    if index_feature_id is None or f.id != index_feature_id\n                ],\n                itype=artifact.schema.itype,\n                index=index_feature,\n                minimal_set=artifact.schema.minimal_set,\n                maximal_set=artifact.schema.maximal_set,\n                coerce=artifact.schema.coerce,\n                ordered_set=artifact.schema.ordered_set,\n            )\n            if (\n        
        feature_set._state.adding\n                and len(features) > settings.annotation.n_max_records\n            ):\n                logger.important(\n                    f\"not annotating with {len(features)} features as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)\"\n                )\n                itype = (\n                    Feature.name\n                    if artifact.schema.itype == \"Composite\"  # backward compat\n                    else parse_cat_dtype(artifact.schema.itype, is_itype=True)[\"field\"]\n                )\n                feature_set = Schema(itype=itype, n_members=len(features))\n\n            ArtifactSchema.objects.update_or_create(\n                artifact=artifact,\n                slot=\"columns\",\n                defaults={\"schema\": feature_set.save()},\n            )\n\n    else:\n        for slot, slot_curator in curator._slots.items():\n            # var_index is backward compat (2025-05-01)\n            name = (\n                \"var_index\"\n                if (slot == \"var\" and \"var_index\" in slot_curator.cat._cat_vectors)\n                else \"columns\"\n            )\n            features = slot_curator.cat._cat_vectors[name].records\n            if features is None:\n                logger.warning(f\"no features found for slot {slot}\")\n                continue\n            validating_schema = slot_curator._schema\n            index_feature = validating_schema.index\n            index_feature_id = None if index_feature is None else index_feature.id\n            feature_set = Schema(\n                features=[\n                    f\n                    for f in features\n                    if index_feature_id is None or f.id != index_feature_id\n                ],\n                itype=validating_schema.itype,\n                index=index_feature,\n                minimal_set=validating_schema.minimal_set,\n                
maximal_set=validating_schema.maximal_set,\n                coerce=validating_schema.coerce,\n                ordered_set=validating_schema.ordered_set,\n            )\n            if (\n                feature_set._state.adding\n                and len(features) > settings.annotation.n_max_records\n            ):\n                logger.important(\n                    f\"not annotating with {len(features)} features for slot {slot} as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)\"\n                )\n                itype = (\n                    Feature.name\n                    if artifact.schema.slots[slot].itype\n                    == \"Composite\"  # backward compat\n                    else parse_cat_dtype(\n                        artifact.schema.slots[slot].itype, is_itype=True\n                    )[\"field\"]\n                )\n                feature_set = Schema(itype=itype, n_members=len(features))\n            ArtifactSchema.objects.update_or_create(\n                artifact=artifact, slot=slot, defaults={\"schema\": feature_set.save()}\n            )\n\n    slug = ln_setup.settings.instance.slug\n    if ln_setup.settings.instance.is_remote:  # pragma: no cover\n        ui_url = ln_setup.settings.instance.ui_url\n        logger.important(f\"go to {ui_url}/{slug}/artifact/{artifact.uid}\")\n    return artifact\n\n\ndef _flatten_unique(series: pd.Series[list[Any] | Any]) -> list[Any]:\n    \"\"\"Flatten a Pandas series containing lists or single items into a unique list of elements.\n\n    The order of elements in the result list preserves the order they first appear in the input series.\n    \"\"\"\n    # Use an insertion-ordered dict to preserve order while ensuring uniqueness\n    result: dict = {}\n\n    for item in series:\n        if isinstance(item, list | np.ndarray):\n            # Add each element to the dict (only first occurrence is kept)\n            for element in item:\n                result[element] = 
None\n        else:\n            result[item] = None\n\n    # Return the keys as a list, preserving order\n    return list(result.keys())\n"
  },
  {
    "path": "lamindb/errors.py",
    "content": "\"\"\"Errors.\n\nDjango.\n\n.. autoexception:: ObjectDoesNotExist\n.. autoexception:: MultipleObjectsReturned\n\nLaminDB.\n\n.. autoexception:: ValidationError\n.. autoexception:: InvalidArgument\n.. autoexception:: NotebookNotSaved\n.. autoexception:: UnknownStorageLocation\n.. autoexception:: MissingContextUID\n.. autoexception:: UpdateContext\n.. autoexception:: IntegrityError\n.. autoexception:: FieldValidationError\n.. autoexception:: NoWriteAccess\n.. autoexception:: BlobHashNotFound\n.. autoexception:: FileNotInDevDir\n.. autoexception:: BranchAlreadyExists\n\n\"\"\"\n\n# -------------------------------------------------------------------------------------\n# Django\n# -------------------------------------------------------------------------------------\n\nfrom django.core.exceptions import (\n    MultipleObjectsReturned,  # noqa: F401\n    ObjectDoesNotExist,  # noqa: F401\n)\n\nObjectDoesNotExist.__doc__ = \"\"\"Object does not exist.\n\nThis is an alias for `django.core.exceptions.ObjectDoesNotExist`.\n\"\"\"\nDoesNotExist = ObjectDoesNotExist  # backward compat\n\nMultipleObjectsReturned.__doc__ = \"\"\"Multiple objects returned.\n\nThis is an alias for `django.core.exceptions.MultipleObjectsReturned`.\n\"\"\"\nMultipleResultsFound = MultipleObjectsReturned  # backward compat\n\n# -------------------------------------------------------------------------------------\n# lamindb\n# -------------------------------------------------------------------------------------\n\n\nclass ValidationError(Exception):\n    \"\"\"Validation error.\"\"\"\n\n    pass\n\n\nclass InvalidArgument(Exception):\n    \"\"\"Invalid method or function argument.\"\"\"\n\n    pass\n\n\nclass TrackNotCalled(Exception):\n    \"\"\"`ln.track()` wasn't called.\"\"\"\n\n    pass\n\n\nclass NotebookNotSaved(Exception):\n    \"\"\"Notebook wasn't saved.\"\"\"\n\n    pass\n\n\nclass UnknownStorageLocation(Exception):\n    \"\"\"Path is not contained in any known storage 
location.\"\"\"\n\n    pass\n\n\nclass NoStorageLocationForSpace(Exception):\n    \"\"\"No storage location found for space.\"\"\"\n\n    pass\n\n\nclass InconsistentKey(Exception):\n    \"\"\"Inconsistent transform or artifact `key`.\"\"\"\n\n    pass\n\n\nclass FieldValidationError(Exception):\n    \"\"\"Field validation error.\"\"\"\n\n    pass\n\n\n# -------------------------------------------------------------------------------------\n# run context\n# -------------------------------------------------------------------------------------\n\n\nclass IntegrityError(Exception):\n    \"\"\"Integrity error.\n\n    For instance, it's not allowed to delete artifacts outside managed storage\n    locations.\n    \"\"\"\n\n    pass\n\n\nclass MissingContextUID(Exception):\n    \"\"\"User didn't define transform settings.\"\"\"\n\n    pass\n\n\nclass UpdateContext(Exception):\n    \"\"\"Transform settings require update.\"\"\"\n\n    pass\n\n\nclass BlobHashNotFound(Exception):\n    \"\"\"Blob hash not found in git or storage.\"\"\"\n\n    pass\n\n\n# -------------------------------------------------------------------------------------\n# CRUD\n# -------------------------------------------------------------------------------------\n\n\nclass NoWriteAccess(Exception):\n    \"\"\"No write access to a space.\"\"\"\n\n    pass\n\n\nclass FileNotInDevDir(Exception):\n    \"\"\"File path is not within the configured dev directory.\"\"\"\n\n    pass\n\n\nclass BranchAlreadyExists(Exception):\n    \"\"\"Branch already exists.\n\n    Raised when creating a branch with `ln.setup.switch(..., create=True)` and\n    a branch with the given name or uid already exists. Consistent with `git switch -c`.\n    \"\"\"\n\n    pass\n"
  },
  {
    "path": "lamindb/examples/__init__.py",
    "content": "\"\"\"Examples.\n\n.. autosummary::\n   :toctree: .\n\n   schemas\n   datasets\n   cellxgene\n   croissant\n   mlflow\n   wandb\n\n\"\"\"\n\nfrom . import croissant, datasets, mlflow, schemas, wandb\nfrom .cellxgene import _cellxgene\n"
  },
  {
    "path": "lamindb/examples/cellxgene/__init__.py",
    "content": "\"\"\"CELLxGENE utilities.\n\n.. autofunction:: save_cellxgene_defaults\n.. autofunction:: create_cellxgene_schema\n\n\"\"\"\n\nfrom ._cellxgene import (\n    create_cellxgene_schema,\n    save_cellxgene_defaults,\n)\n"
  },
  {
    "path": "lamindb/examples/cellxgene/_cellxgene.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Collection, Literal, NamedTuple\n\nif TYPE_CHECKING:\n    from lamindb.base.types import FieldAttr\n    from lamindb.models import Registry, Schema\n\nCELLxGENEOrganisms = Literal[\n    \"human\",\n    \"mouse\",\n    \"zebra danio\",\n    \"rhesus macaque\",\n    \"domestic pig\",\n    \"chimpanzee\",\n    \"white-tufted-ear marmoset\",\n    \"sars-2\",\n]\nFieldType = Literal[\"ontology_id\", \"name\"]\n\n\ndef save_cellxgene_defaults() -> None:\n    \"\"\"Save default values of the CELLxGENE schema to the instance.\n\n    Adds CELLxGENE specific (control) values that are not available in the ontologies:\n\n    - \"normal\" Disease\n    - \"na\" and \"unknown\" Ethnicity\n    - \"unknown\" entries for DevelopmentalStage, Phenotype, and CellType\n    - \"tissue\", \"organoid\", \"primary cell culture\", and \"cell line\" ULabels (tissue_type)\n    - \"cell\", \"nucleus\", \"na\" ULabels (suspension_type)\n    \"\"\"\n    import bionty as bt\n\n    from lamindb.models import ULabel\n\n    # \"normal\" in Disease\n    normal = bt.Phenotype.from_source(\n        ontology_id=\"PATO:0000461\",\n        source=bt.Source.get(name=\"pato\", currently_used=True),\n    )\n    bt.Disease(\n        uid=normal.uid,\n        name=normal.name,\n        ontology_id=normal.ontology_id,\n        description=normal.description,\n        source=normal.source,  # not sure\n    ).save()\n\n    # na, unknown\n    for model, name in zip(\n        [\n            bt.Ethnicity,\n            bt.Ethnicity,\n            bt.DevelopmentalStage,\n            bt.Phenotype,\n            bt.CellType,\n        ],\n        [\"na\", \"unknown\", \"unknown\", \"unknown\", \"unknown\"],\n    ):\n        model(ontology_id=name, name=name, description=\"From CellxGene schema.\").save()\n\n    # tissue_type\n    tissue_type = ULabel(\n        name=\"TissueType\",\n        is_type=True,\n        description='From CellxGene schema. 
Is \"tissue\", \"organoid\", \"primary cell culture\", or \"cell line\".',\n    ).save()\n    for name in [\"tissue\", \"organoid\", \"primary cell culture\", \"cell line\"]:\n        ULabel(name=name, type=tissue_type, description=\"From CellxGene schema.\").save()\n\n    # suspension_type\n    suspension_type = ULabel(\n        name=\"SuspensionType\",\n        is_type=True,\n        description='From CellxGene schema. This MUST be \"cell\", \"nucleus\", or \"na\".',\n    ).save()\n    for name in [\"cell\", \"nucleus\", \"na\"]:\n        ULabel(\n            name=name, type=suspension_type, description=\"From CellxGene schema.\"\n        ).save()\n\n    # organisms\n    taxonomy_ids = [\n        \"NCBITaxon:9606\",  # Homo sapiens (Human)\n        \"NCBITaxon:10090\",  # Mus musculus (House mouse)\n        \"NCBITaxon:9544\",  # Macaca mulatta (Rhesus monkey)\n        \"NCBITaxon:9825\",  # Sus scrofa domesticus (Domestic pig)\n        \"NCBITaxon:9598\",  # Pan troglodytes (Chimpanzee)\n        \"NCBITaxon:9483\",  # Callithrix jacchus (White-tufted-ear marmoset)\n        \"NCBITaxon:7955\",  # Danio rerio (Zebrafish)\n    ]\n    for ontology_id in taxonomy_ids:\n        bt.Organism.from_source(\n            ontology_id=ontology_id,\n            source=bt.Source.get(name=\"ncbitaxon\", currently_used=True),\n        ).save()\n\n\ndef create_cellxgene_schema(\n    *,\n    field_types: FieldType | Collection[FieldType] = \"ontology_id\",\n    spatial_library_id: str | None = None,\n    organism: CELLxGENEOrganisms = \"human\",\n) -> Schema:\n    \"\"\"Generates a :class:`~lamindb.Schema` for a specific CELLxGENE schema version.\n\n    Args:\n        field_types: One or several of 'ontology_id', 'name'.\n        organism: The organism of the Schema.\n        spatial_library_id: Identifier for the spatial library.\n            Specifying this value enables curation against spatial requirements.\n    \"\"\"\n    import bionty as bt\n\n    from lamindb.models import Feature, 
Schema, ULabel\n\n    class CategorySpec(NamedTuple):\n        field: str | FieldAttr | list[Registry]\n        default: str | None\n        needs_organism: bool = False\n\n    categoricals_to_spec: dict[str, CategorySpec] = {\n        \"assay\": CategorySpec(bt.ExperimentalFactor.name, None, False),\n        \"assay_ontology_term_id\": CategorySpec(\n            bt.ExperimentalFactor.ontology_id, None, False\n        ),\n        \"cell_type\": CategorySpec(bt.CellType.name, \"unknown\", False),\n        \"cell_type_ontology_term_id\": CategorySpec(\n            bt.CellType.ontology_id, None, False\n        ),\n        \"development_stage\": CategorySpec(bt.DevelopmentalStage.name, \"unknown\", True),\n        \"development_stage_ontology_term_id\": CategorySpec(\n            bt.DevelopmentalStage.ontology_id, None, True\n        ),\n        \"disease\": CategorySpec(bt.Disease.name, \"normal\", False),\n        \"disease_ontology_term_id\": CategorySpec(bt.Disease.ontology_id, None, False),\n        \"self_reported_ethnicity\": CategorySpec(bt.Ethnicity.name, \"unknown\", False),\n        \"self_reported_ethnicity_ontology_term_id\": CategorySpec(\n            bt.Ethnicity.ontology_id, None, False\n        ),\n        \"sex\": CategorySpec(bt.Phenotype.name, \"unknown\", False),\n        \"sex_ontology_term_id\": CategorySpec(bt.Phenotype.ontology_id, None, False),\n        \"suspension_type\": CategorySpec(ULabel.name, \"cell\", False),\n        \"tissue\": CategorySpec(bt.Tissue.name, None, False),\n        \"tissue_ontology_term_id\": CategorySpec(\n            [bt.Tissue.ontology_id, bt.CellType.ontology_id], None, False\n        ),\n        \"tissue_type\": CategorySpec(ULabel.name, \"tissue\", False),\n        \"organism\": CategorySpec(bt.Organism.scientific_name, None, False),\n        \"organism_ontology_term_id\": CategorySpec(bt.Organism.ontology_id, None, False),\n        \"donor_id\": CategorySpec(str, \"unknown\", False),\n    }\n\n    def 
_get_source_cat_filters(\n        field: str | FieldAttr | type[Registry], *, needs_organism: bool | None = None\n    ) -> dict | None:\n        \"\"\"Some ontologies are organism-specific and their Features therefore need `cat_filters`.\"\"\"\n        if isinstance(field, str) or not needs_organism:\n            return None\n        registry = field.field.model if hasattr(field, \"field\") else field\n        entity = f\"bionty.{registry.__name__}\"\n        filters = {\"entity\": entity, \"currently_used\": True}\n        if needs_organism:\n            filters[\"organism\"] = organism\n        return {\"source\": bt.Source.filter(**filters).one()}\n\n    field_types_set = (\n        {field_types} if isinstance(field_types, str) else set(field_types)\n    )\n    if field_types_set == {\"ontology_id\"}:\n        categoricals = {\n            k: v.field\n            for k, v in categoricals_to_spec.items()\n            if k.endswith(\"_ontology_term_id\") or k == \"donor_id\"\n        }\n    elif field_types_set == {\"name\"}:\n        categoricals = {\n            k: v.field\n            for k, v in categoricals_to_spec.items()\n            if not k.endswith(\"_ontology_term_id\") and k != \"donor_id\"\n        }\n    elif field_types_set == {\"name\", \"ontology_id\"}:\n        categoricals = {k: v.field for k, v in categoricals_to_spec.items()}\n    else:\n        raise ValueError(\n            f\"Invalid field_types: {field_types}. 
Must contain 'ontology_id', 'name', or both.\"\n        )\n\n    organism_fields = {\"organism\", \"organism_ontology_term_id\"}\n    obs_categoricals = {\n        k: v for k, v in categoricals.items() if k not in organism_fields\n    }\n\n    var_schema = Schema(\n        name=\"var of CELLxGENE\",\n        index=Feature(\n            name=\"var_index\",\n            dtype=bt.Gene.ensembl_gene_id,\n            cat_filters=_get_source_cat_filters(\n                bt.Gene.ensembl_gene_id, needs_organism=True\n            ),\n        ).save(),\n        itype=Feature,\n        features=[Feature(name=\"feature_is_filtered\", dtype=bool).save()],\n        dtype=\"DataFrame\",\n        coerce=True,\n    ).save()\n\n    obs_features = []\n    for field in obs_categoricals:\n        if field == \"var_index\":\n            continue\n        dtype = obs_categoricals[field]\n        needs_organism = categoricals_to_spec[field].needs_organism\n\n        cat_filters: dict | list[dict] | None\n        if isinstance(dtype, list):\n            cat_filters = (\n                [\n                    _get_source_cat_filters(d, needs_organism=needs_organism)\n                    for d in dtype\n                ]\n                if needs_organism\n                else None\n            )\n        elif not isinstance(dtype, str):\n            cat_filters = _get_source_cat_filters(dtype, needs_organism=needs_organism)\n        else:\n            cat_filters = None\n\n        obs_features.append(\n            Feature(  # type: ignore\n                name=field,\n                dtype=dtype,\n                default_value=categoricals_to_spec[field].default,\n                cat_filters=cat_filters,  # type: ignore\n            ).save()\n        )\n\n    for name in [\"is_primary_data\", \"suspension_type\", \"tissue_type\"]:\n        obs_features.append(Feature(name=name, dtype=ULabel.name).save())\n\n    obs_schema = Schema(\n        name=f\"obs of CELLxGENE of {field_types}\",\n     
   features=obs_features,\n        otype=\"DataFrame\",\n        minimal_set=True,\n        coerce=True,\n    ).save()\n\n    slots = {\"var\": var_schema, \"obs\": obs_schema}\n\n    uns_categoricals = {k: v for k, v in categoricals.items() if k in organism_fields}\n\n    uns_features = [\n        Feature(\n            name=field,\n            dtype=uns_categoricals[field],\n            default_value=categoricals_to_spec[field].default,\n        ).save()\n        for field in uns_categoricals\n    ]\n\n    uns_schema = Schema(\n        name=\"uns of CELLxGENE version\",\n        features=uns_features,\n        otype=\"DataFrame\",\n        minimal_set=True,\n        coerce=True,\n    ).save()\n\n    slots[\"uns\"] = uns_schema\n\n    # Add spatial validation if library_id is provided\n    if spatial_library_id:\n        scalefactors_schema = Schema(\n            name=f\"scalefactors of spatial {spatial_library_id}\",\n            features=[\n                Feature(name=\"spot_diameter_fullres\", dtype=float).save(),\n                Feature(name=\"tissue_hires_scalef\", dtype=float).save(),\n            ],\n        ).save()\n\n        spatial_schema = Schema(\n            name=\"CELLxGENE spatial metadata\",\n            features=[\n                Feature(\n                    name=\"is_single\",\n                    dtype=bool,\n                    description=\"True if dataset represents single spatial unit (tissue section for Visium, array for Slide-seqV2)\",\n                ).save()\n            ],\n        ).save()\n\n        slots[\"uns:spatial\"] = spatial_schema\n        slots[f\"uns:spatial:{spatial_library_id}:scalefactors\"] = scalefactors_schema\n\n    # Spatial library ID must be in the name\n    # Otherwise, we have lookup side effects where other existing Spatial Library IDs make it into the Schema\n    schema_name = f\"CELLxGENE AnnData of {', '.join(field_types) if isinstance(field_types, list) else field_types}\"\n    if spatial_library_id:\n  
      schema_name += f\" ({spatial_library_id})\"\n\n    full_cxg_schema = Schema(\n        name=schema_name,\n        otype=\"AnnData\",\n        minimal_set=True,\n        coerce=True,\n        slots=slots,\n    ).save()\n\n    return full_cxg_schema\n"
  },
  {
    "path": "lamindb/examples/croissant/__init__.py",
    "content": "\"\"\"Examples for MLCommons Croissant files, which are used to store metadata about datasets.\n\n.. autofunction:: mini_immuno\n\n\"\"\"\n\nimport json\nfrom pathlib import Path\n\n\ndef mini_immuno(\n    n_files: int = 1, filepath_prefix: str = \"\", strip_version: bool = False\n) -> list[Path]:\n    \"\"\"Return paths to the mini immuno dataset and its metadata as a Croissant file.\n\n    Args:\n        n_files: Number of files inside the croissant file (1 or 2).\n        filepath_prefix: Directory under which the zarr dataset is written; dataset paths\n            referenced in the croissant file are prefixed accordingly.\n        strip_version: Remove the `version` field from the croissant metadata.\n\n    Example\n\n        ::\n\n            croissant_path, dataset1_path = ln.examples.croissant.mini_immuno()\n            croissant_path, dataset1_path, dataset2_path = ln.examples.croissant.mini_immuno(n_files=2)\n    \"\"\"\n    from ..datasets import file_mini_csv\n    from ..datasets.mini_immuno import get_dataset1\n\n    adata = get_dataset1(otype=\"AnnData\")\n    if filepath_prefix:\n        dataset1_path = Path(filepath_prefix) / \"mini_immuno.anndata.zarr\"\n    else:\n        dataset1_path = Path(\"mini_immuno.anndata.zarr\")\n    adata.write_zarr(dataset1_path)\n    orig_croissant_path = (\n        Path(__file__).parent / \"mini_immuno.anndata.zarr_metadata.json\"\n    )\n    with open(orig_croissant_path, encoding=\"utf-8\") as f:\n        data = json.load(f)\n    if filepath_prefix:\n        assert data[\"distribution\"][0][\"@id\"] == \"mini_immuno.anndata.zarr\"  # noqa: S101\n        data[\"distribution\"][0][\"@id\"] = str(Path(filepath_prefix) / dataset1_path.name)\n    if strip_version:\n        data.pop(\"version\", None)\n    if n_files == 2:\n        file_mini_csv()\n        if filepath_prefix:\n            dataset2_path = Path(filepath_prefix) / \"mini.csv\"\n        else:\n            dataset2_path = Path(\"mini.csv\")\n        data[\"distribution\"].append(\n            {\n                \"@type\": \"sc:FileObject\",\n                \"@id\": dataset2_path.as_posix(),\n            
    \"name\": \"mini.csv\",\n                \"encodingFormat\": \"text/csv\",\n            }\n        )\n    croissant_path = Path(\"mini_immuno.anndata.zarr_metadata.json\")\n    with open(croissant_path, \"w\", encoding=\"utf-8\") as f:\n        json.dump(data, f, indent=2)\n\n    result: list[Path] = [croissant_path, dataset1_path]\n    if n_files == 1:\n        return result\n    result.append(dataset2_path)\n\n    return result\n"
  },
  {
    "path": "lamindb/examples/croissant/mini_immuno.anndata.zarr_metadata.json",
    "content": "{\n  \"@context\": {\n    \"@vocab\": \"https://schema.org/\",\n    \"cr\": \"https://mlcommons.org/croissant/\",\n    \"ml\": \"http://ml-schema.org/\",\n    \"sc\": \"https://schema.org/\",\n    \"dct\": \"http://purl.org/dc/terms/\",\n    \"data\": \"https://mlcommons.org/croissant/data/\",\n    \"rai\": \"https://mlcommons.org/croissant/rai/\",\n    \"format\": \"https://mlcommons.org/croissant/format/\",\n    \"citeAs\": \"https://mlcommons.org/croissant/citeAs/\",\n    \"conformsTo\": \"https://mlcommons.org/croissant/conformsTo/\",\n    \"@language\": \"en\",\n    \"repeated\": \"https://mlcommons.org/croissant/repeated/\",\n    \"field\": \"https://mlcommons.org/croissant/field/\",\n    \"examples\": \"https://mlcommons.org/croissant/examples/\",\n    \"recordSet\": \"https://mlcommons.org/croissant/recordSet/\",\n    \"fileObject\": \"https://mlcommons.org/croissant/fileObject/\",\n    \"fileSet\": \"https://mlcommons.org/croissant/fileSet/\",\n    \"source\": \"https://mlcommons.org/croissant/source/\",\n    \"references\": \"https://mlcommons.org/croissant/references/\",\n    \"key\": \"https://mlcommons.org/croissant/key/\",\n    \"parentField\": \"https://mlcommons.org/croissant/parentField/\",\n    \"isLiveDataset\": \"https://mlcommons.org/croissant/isLiveDataset/\",\n    \"separator\": \"https://mlcommons.org/croissant/separator/\",\n    \"extract\": \"https://mlcommons.org/croissant/extract/\",\n    \"subField\": \"https://mlcommons.org/croissant/subField/\",\n    \"regex\": \"https://mlcommons.org/croissant/regex/\",\n    \"column\": \"https://mlcommons.org/croissant/column/\",\n    \"path\": \"https://mlcommons.org/croissant/path/\",\n    \"fileProperty\": \"https://mlcommons.org/croissant/fileProperty/\",\n    \"md5\": \"https://mlcommons.org/croissant/md5/\",\n    \"jsonPath\": \"https://mlcommons.org/croissant/jsonPath/\",\n    \"transform\": \"https://mlcommons.org/croissant/transform/\",\n    \"replace\": 
\"https://mlcommons.org/croissant/replace/\",\n    \"dataType\": \"https://mlcommons.org/croissant/dataType/\",\n    \"includes\": \"https://mlcommons.org/croissant/includes/\",\n    \"excludes\": \"https://mlcommons.org/croissant/excludes/\"\n  },\n  \"@type\": \"Dataset\",\n  \"name\": \"Mini immuno dataset\",\n  \"description\": \"A few samples from the immunology dataset\",\n  \"url\": \"https://lamin.ai/laminlabs/lamindata/artifact/tCUkRcaEjTjhtozp0000\",\n  \"creator\": {\n    \"@type\": \"Person\",\n    \"name\": \"falexwolf\"\n  },\n  \"dateCreated\": \"2025-07-16\",\n  \"cr:projectName\": \"Mini Immuno Project\",\n  \"datePublished\": \"2025-07-16\",\n  \"version\": \"1.0\",\n  \"license\": \"https://creativecommons.org/licenses/by/4.0/\",\n  \"citation\": \"Please cite this dataset as: mini immuno (2025)\",\n  \"encodingFormat\": \"zarr\",\n  \"distribution\": [\n    {\n      \"@type\": \"cr:FileSet\",\n      \"@id\": \"mini_immuno.anndata.zarr\",\n      \"containedIn\": {\n        \"@id\": \"directory\"\n      },\n      \"encodingFormat\": \"zarr\"\n    }\n  ],\n  \"cr:recordSet\": [\n    {\n      \"@type\": \"cr:RecordSet\",\n      \"@id\": \"#samples\",\n      \"name\": \"samples\",\n      \"description\": \"my sample\"\n    }\n  ]\n}\n"
  },
  {
    "path": "lamindb/examples/datasets/__init__.py",
    "content": "\"\"\"Example datasets.\n\nThe mini immuno dataset\n-----------------------\n\n.. autosummary::\n   :toctree: .\n\n   mini_immuno\n\nSmall in-memory datasets\n------------------------\n\n.. autofunction:: anndata_with_obs\n\nFiles\n-----\n\n.. autofunction:: file_fcs\n.. autofunction:: file_fcs_alpert19\n.. autofunction:: file_tsv_rnaseq_nfcore_salmon_merged_gene_counts\n.. autofunction:: file_jpg_paradisi05\n.. autofunction:: file_tiff_suo22\n.. autofunction:: file_fastq\n.. autofunction:: file_bam\n.. autofunction:: file_mini_csv\n\nDirectories\n-----------\n\n.. autofunction:: dir_scrnaseq_cellranger\n.. autofunction:: dir_iris_images\n\nDictionary, Dataframe, AnnData, MuData, SpatialData\n----------------------------------------------------\n\n.. autofunction:: dict_cellxgene_uns\n.. autofunction:: df_iris\n.. autofunction:: df_iris_in_meter\n.. autofunction:: df_iris_in_meter_study1\n.. autofunction:: df_iris_in_meter_study2\n.. autofunction:: anndata_mouse_sc_lymph_node\n.. autofunction:: anndata_human_immune_cells\n.. autofunction:: anndata_pbmc68k_reduced\n.. autofunction:: anndata_file_pbmc68k_test\n.. autofunction:: anndata_pbmc3k_processed\n.. autofunction:: anndata_suo22_Visium10X\n.. autofunction:: anndata_visium_mouse_cellxgene\n.. autofunction:: mudata_papalexi21_subset\n.. autofunction:: schmidt22_crispra_gws_IFNG\n.. autofunction:: schmidt22_perturbseq\n.. autofunction:: spatialdata_blobs\n\n\nOther\n-----\n\n.. autofunction:: fake_bio_notebook_titles\n\"\"\"\n\nimport importlib.util\nimport sys\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from . 
import mini_immuno\n    from ._core import (\n        anndata_file_pbmc68k_test,\n        anndata_human_immune_cells,\n        anndata_mouse_sc_lymph_node,\n        anndata_pbmc3k_processed,\n        anndata_pbmc68k_reduced,\n        anndata_suo22_Visium10X,\n        anndata_visium_mouse_cellxgene,\n        df_iris,\n        df_iris_in_meter,\n        df_iris_in_meter_study1,\n        df_iris_in_meter_study2,\n        dict_cellxgene_uns,\n        dir_iris_images,\n        dir_scrnaseq_cellranger,\n        file_bam,\n        file_fastq,\n        file_fcs,\n        file_fcs_alpert19,\n        file_jpg_paradisi05,\n        file_mini_csv,\n        file_tiff_suo22,\n        file_tsv_rnaseq_nfcore_salmon_merged_gene_counts,\n        mudata_papalexi21_subset,\n        schmidt22_crispra_gws_IFNG,\n        schmidt22_perturbseq,\n        spatialdata_blobs,\n    )\n    from ._fake import fake_bio_notebook_titles\n    from ._small import anndata_with_obs, small_dataset3_cellxgene\n    from .mini_immuno import get_dataset1 as small_dataset1\n    from .mini_immuno import get_dataset2 as small_dataset2\n\n\ndef __getattr__(name: str):\n    \"\"\"Lazy-import datasets to avoid loading pandas/anndata at package import.\"\"\"\n    if name == \"mini_immuno\":\n        # Use importlib to avoid __getattr__ recursion when importing submodule\n        spec = importlib.util.find_spec(\n            \"lamindb.examples.datasets.mini_immuno\",\n            package=\"lamindb.examples.datasets\",\n        )\n        if spec is None or spec.loader is None:\n            raise ImportError(\"Could not find module mini_immuno\")\n        module = importlib.util.module_from_spec(spec)\n        sys.modules[\"lamindb.examples.datasets.mini_immuno\"] = module\n        spec.loader.exec_module(module)\n        return module\n    if name in (\"small_dataset1\", \"small_dataset2\"):\n        mini_immuno = importlib.import_module(\n            \".mini_immuno\", package=\"lamindb.examples.datasets\"\n        
)\n        return (\n            mini_immuno.get_dataset1\n            if name == \"small_dataset1\"\n            else mini_immuno.get_dataset2\n        )\n    _core_names = (\n        \"anndata_file_pbmc68k_test\",\n        \"anndata_human_immune_cells\",\n        \"anndata_mouse_sc_lymph_node\",\n        \"anndata_pbmc3k_processed\",\n        \"anndata_pbmc68k_reduced\",\n        \"anndata_suo22_Visium10X\",\n        \"df_iris\",\n        \"df_iris_in_meter\",\n        \"df_iris_in_meter_study1\",\n        \"df_iris_in_meter_study2\",\n        \"dict_cellxgene_uns\",\n        \"dir_iris_images\",\n        \"dir_scrnaseq_cellranger\",\n        \"file_bam\",\n        \"file_fastq\",\n        \"file_fcs\",\n        \"file_fcs_alpert19\",\n        \"file_jpg_paradisi05\",\n        \"file_mini_csv\",\n        \"file_tiff_suo22\",\n        \"file_tsv_rnaseq_nfcore_salmon_merged_gene_counts\",\n        \"mudata_papalexi21_subset\",\n        \"schmidt22_crispra_gws_IFNG\",\n        \"schmidt22_perturbseq\",\n        \"spatialdata_blobs\",\n        \"anndata_visium_mouse_cellxgene\",\n    )\n    if name in _core_names:\n        _core = importlib.import_module(\"._core\", package=\"lamindb.examples.datasets\")\n        return getattr(_core, name)\n    if name in (\"anndata_with_obs\", \"small_dataset3_cellxgene\"):\n        _small = importlib.import_module(\"._small\", package=\"lamindb.examples.datasets\")\n        return getattr(_small, name)\n    if name == \"fake_bio_notebook_titles\":\n        _fake = importlib.import_module(\"._fake\", package=\"lamindb.examples.datasets\")\n        return _fake.fake_bio_notebook_titles\n    raise AttributeError(f\"module {__name__!r} has no attribute {name!r}\")\n\n\n__all__ = [\n    \"mini_immuno\",\n    \"small_dataset1\",\n    \"small_dataset2\",\n    \"small_dataset3_cellxgene\",\n    \"anndata_with_obs\",\n    \"anndata_file_pbmc68k_test\",\n    \"anndata_human_immune_cells\",\n    \"anndata_mouse_sc_lymph_node\",\n    
\"anndata_pbmc3k_processed\",\n    \"anndata_pbmc68k_reduced\",\n    \"anndata_suo22_Visium10X\",\n    \"anndata_visium_mouse_cellxgene\",\n    \"df_iris\",\n    \"df_iris_in_meter\",\n    \"df_iris_in_meter_study1\",\n    \"df_iris_in_meter_study2\",\n    \"dict_cellxgene_uns\",\n    \"dir_iris_images\",\n    \"dir_scrnaseq_cellranger\",\n    \"fake_bio_notebook_titles\",\n    \"file_bam\",\n    \"file_fastq\",\n    \"file_fcs\",\n    \"file_fcs_alpert19\",\n    \"file_jpg_paradisi05\",\n    \"file_mini_csv\",\n    \"file_tiff_suo22\",\n    \"file_tsv_rnaseq_nfcore_salmon_merged_gene_counts\",\n    \"mudata_papalexi21_subset\",\n    \"schmidt22_crispra_gws_IFNG\",\n    \"schmidt22_perturbseq\",\n    \"spatialdata_blobs\",\n]\n"
  },
  {
    "path": "lamindb/examples/datasets/_core.py",
    "content": "from __future__ import annotations\n\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any\nfrom urllib.request import urlretrieve\n\nimport anndata as ad\nimport pandas as pd\nfrom upath import UPath\n\nfrom lamindb.base.uids import base62\nfrom lamindb.core._settings import settings\n\nif TYPE_CHECKING:\n    from mudata import MuData\n    from spatialdata import SpatialData\n\n\ndef file_fcs() -> Path:\n    \"\"\"Example FCS artifact.\"\"\"\n    filepath, _ = urlretrieve(\n        \"https://lamindb-dev-datasets.s3.amazonaws.com/.lamindb/DBNEczSgBui0bbzBXMGH.fcs\",\n        \"example.fcs\",\n    )\n    return Path(filepath)\n\n\ndef file_fcs_alpert19(populate_registries: bool = False) -> Path:\n    \"\"\"FCS file from Alpert19.\n\n    Args:\n        populate_registries: pre-populate metadata records to simulate existing registries  # noqa\n    \"\"\"\n    filepath, _ = urlretrieve(\n        \"https://lamindb-test.s3.amazonaws.com/Alpert19-070314-Mike-Study+15-2013-plate+1-15-004-1-13_cells_found.fcs\",\n        \"Alpert19.fcs\",\n    )\n    if populate_registries:\n        import bionty as bt\n        import readfcs\n\n        import lamindb as ln\n\n        verbosity = ln.settings.verbosity\n        ln.settings.verbosity = \"error\"\n        adata = readfcs.read(filepath)\n        std = bt.CellMarker.public().standardize(adata.var.index)\n        ln.save(\n            bt.CellMarker.from_values(\n                bt.CellMarker.public().inspect(std, \"name\").validated, \"name\"\n            )\n        )\n        ln.Feature(name=\"assay\", dtype=[bt.ExperimentalFactor]).save()  # type: ignore\n        ln.Feature(name=\"organism\", dtype=[bt.Organism]).save()  # type: ignore\n        ln.settings.verbosity = verbosity\n    return Path(filepath)\n\n\ndef file_jpg_paradisi05() -> Path:\n    \"\"\"JPG file example.\n\n    Originally from: https://upload.wikimedia.org/wikipedia/commons/2/28/Laminopathic_nuclei.jpg\n    \"\"\"\n    filepath, _ = 
urlretrieve(\n        \"https://lamindb-test.s3.amazonaws.com/Laminopathic_nuclei.jpg\",\n        \"paradisi05_laminopathic_nuclei.jpg\",\n    )\n    return Path(filepath)\n\n\ndef file_tsv_rnaseq_nfcore_salmon_merged_gene_counts(\n    populate_registries: bool = False,\n) -> Path:\n    \"\"\"Gene counts table from nf-core RNA-seq pipeline.\n\n    Output of: https://nf-co.re/rnaseq\n    \"\"\"\n    filepath, _ = urlretrieve(\n        \"https://lamindb-test.s3.amazonaws.com/salmon.merged.gene_counts.tsv\",\n        \"salmon.merged.gene_counts.tsv\",\n    )\n    if populate_registries:\n        import bionty as bt\n\n        import lamindb as ln\n\n        verbosity = ln.settings.verbosity\n        ln.settings.verbosity = \"error\"\n        ln.Feature(name=\"assay\", dtype=[bt.ExperimentalFactor]).save()  # type: ignore\n        ln.Feature(name=\"organism\", dtype=[bt.Organism]).save()  # type: ignore\n        bt.ExperimentalFactor.from_source(ontology_id=\"EFO:0008896\").save()\n        ln.settings.verbosity = verbosity\n\n    return Path(filepath)\n\n\ndef file_fastq(in_storage_root=False) -> Path:\n    \"\"\"Mini mock fastq artifact.\"\"\"\n    basedir = Path() if not in_storage_root else settings.storage.root\n    filepath = basedir / \"input.fastq.gz\"\n    with open(filepath, \"w\") as f:\n        f.write(\"Mock fastq artifact.\")\n    return filepath\n\n\ndef file_bam(in_storage_root=False) -> Path:\n    \"\"\"Mini mock bam artifact.\"\"\"\n    basedir = Path() if not in_storage_root else settings.storage.root\n    filepath = basedir / \"output.bam\"\n    with open(filepath, \"w\") as f:\n        f.write(\"Mock bam artifact.\")\n    return filepath\n\n\ndef file_mini_csv(in_storage_root=False) -> Path:\n    \"\"\"Mini csv artifact.\"\"\"\n    basedir = Path() if not in_storage_root else settings.storage.root\n    filepath = basedir / \"mini.csv\"\n    df = pd.DataFrame([1, 2, 3], columns=[\"test\"])\n    df.to_csv(filepath, index=False)\n    return 
filepath\n\n\ndef file_tiff_suo22() -> Path:\n    \"\"\"Image file from Suo22.\n\n    Pair with anndata_suo22_Visium10X\n    \"\"\"\n    filepath, _ = urlretrieve(\n        \"https://lamindb-test.s3.amazonaws.com/F121_LP1_4LIV.tiff\",\n        \"F121_LP1_4LIV.tiff\",\n    )\n    Path(\"suo22/\").mkdir(exist_ok=True)\n    filepath = Path(filepath).rename(\"suo22/F121_LP1_4LIV.tiff\")  # type: ignore\n    return Path(filepath)\n\n\ndef dir_iris_images() -> UPath:\n    \"\"\"Directory with 3 studies of the Iris flower: 405 images & metadata.\n\n    Provenance: https://lamin.ai/laminlabs/lamindata/transform/3q4MpQxRL2qZ5zKv\n\n    The problem is that the same artifact was also ingested by the downstream demo notebook:\n    https://lamin.ai/laminlabs/lamindata/transform/NJvdsWWbJlZS5zKv\n\n    This is why on the UI, the artifact shows up as output of the downstream\n    demo notebook rather than the upstream curation notebook.\n    The lineage information should still be captured by\n    https://github.com/laminlabs/lnschema-core/blob/a90437e91dfbd6b9002f18c3e978bd0f9c9a632d/lamindb/models.py#L2050-L2052\n    but we don't use this in the UI yet.\n    \"\"\"\n    return UPath(\"s3://lamindata/iris_studies\")\n\n\ndef anndata_mouse_sc_lymph_node(\n    populate_registries: bool = False,\n) -> ad.AnnData:\n    \"\"\"Mouse lymph node scRNA-seq collection from EBI.\n\n    Subsampled to 10k genes.\n\n    From: https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-8414/\n\n    Args:\n        populate_registries: pre-populate metadata records to simulate existing registries  # noqa\n    \"\"\"\n    filepath, _ = urlretrieve(\"https://lamindb-test.s3.amazonaws.com/E-MTAB-8414.h5ad\")\n    adata = ad.read_h5ad(filepath)\n\n    # The column names are a bit lengthy, let's abbreviate them:\n    adata.obs.columns = (\n        adata.obs.columns.str.replace(\"Sample Characteristic\", \"\")\n        .str.replace(\"Factor Value \", \"Factor Value:\", regex=True)\n        
.str.replace(\"Factor Value\\\\[\", \"Factor Value:\", regex=True)\n        .str.replace(\" Ontology Term\\\\[\", \"ontology_id:\", regex=True)\n        .str.strip(\"[]\")\n        .str.replace(\"organism part\", \"tissue\")\n        .str.replace(\"organism\", \"organism\")\n        .str.replace(\"developmental stage\", \"developmental_stage\")\n        .str.replace(\"cell type\", \"cell_type\")\n        # the last one could be interesting, too\n        # .str.replace(\"Factor Value:Ontology Term[inferred cell_type - authors labels\", \"cell_type_authors\")\n    )\n    # subset columns to only the ones with names\n    columns = [\n        col\n        for col in adata.obs.columns\n        if not col.startswith(\"ontology_id\")\n        and not col.startswith(\"Factor Value\")\n        and col != \"strain\"\n    ]\n    adata.obs = adata.obs[columns]\n\n    # pre-populate registries\n    if populate_registries:\n        import bionty as bt\n\n        import lamindb as ln\n\n        verbosity = ln.settings.verbosity\n        ln.settings.verbosity = \"error\"\n        # strain\n        bt.ExperimentalFactor.from_source(ontology_id=\"EFO:0004472\").save()\n        # developmental stage\n        bt.ExperimentalFactor.from_source(ontology_id=\"EFO:0001272\").save()\n        # tissue\n        bt.Tissue.from_source(ontology_id=\"UBERON:0001542\").save()\n        # cell types\n        ln.save(bt.CellType.from_values([\"CL:0000115\", \"CL:0000738\"], \"ontology_id\"))\n        # assays\n        ln.Feature(name=\"assay\", dtype=[bt.ExperimentalFactor]).save()  # type: ignore\n        bt.ExperimentalFactor.from_source(ontology_id=\"EFO:0008913\").save()\n        # genes\n        validated = bt.Gene.public(organism=\"mouse\").validate(\n            adata.var.index, field=\"ensembl_gene_id\"\n        )\n        ln.save(\n            bt.Gene.from_values(\n                adata.var.index[validated][:-19],\n                field=\"ensembl_gene_id\",\n                
organism=\"mouse\",\n            )\n        )\n        # labels\n        labels = []\n        for col in [\"sex\", \"age\", \"genotype\", \"immunophenotype\"]:\n            labels += [ln.ULabel(name=name) for name in adata.obs[col]]\n        ln.save(labels)\n        ln.settings.verbosity = verbosity\n\n    return adata\n\n\ndef anndata_pbmc68k_reduced() -> ad.AnnData:\n    \"\"\"Modified from scanpy.collections.pbmc68k_reduced().\n\n    This code was run::\n\n        pbmc68k = sc.collections.pbmc68k_reduced()\n        pbmc68k.obs.rename(columns={\"bulk_labels\": \"cell_type\"}, inplace=True)\n        pbmc68k.obs[\"cell_type\"] = pbmc68k.obs[\"cell_type\"].cat.rename_categories(\n            {\"Dendritic\": \"Dendritic cells\", \"CD14+ Monocyte\": \"CD14+ Monocytes\"}\n        )\n        del pbmc68k.obs[\"G2M_score\"]\n        del pbmc68k.obs[\"S_score\"]\n        del pbmc68k.obs[\"phase\"]\n        del pbmc68k.obs[\"n_counts\"]\n        del pbmc68k.var[\"dispersions\"]\n        del pbmc68k.var[\"dispersions_norm\"]\n        del pbmc68k.var[\"means\"]\n        del pbmc68k.uns[\"rank_genes_groups\"]\n        del pbmc68k.uns[\"bulk_labels_colors\"]\n        sc.pp.subsample(pbmc68k, fraction=0.1, random_state=123)\n        pbmc68k.write(\"scrnaseq_pbmc68k_tiny.h5ad\")\n    \"\"\"\n    filepath, _ = urlretrieve(\n        \"https://lamindb-dev-datasets.s3.amazonaws.com/scrnaseq_pbmc68k_tiny.h5ad\"\n    )\n    return ad.read_h5ad(filepath)\n\n\ndef anndata_file_pbmc68k_test() -> Path:\n    \"\"\"Modified from scanpy.collections.pbmc68k_reduced().\n\n    Additional slots were added for testing purposes. 
Returns the filepath.\n\n    To reproduce::\n\n        pbmc68k = ln.examples.datasets.anndata_pbmc68k_reduced()\n        pbmc68k_test = pbmc68k[:30, :200].copy()\n        pbmc68k_test.raw = pbmc68k_test[:, :100]\n        pbmc68k_test.obsp[\"test\"] = sparse.eye(pbmc68k_test.shape[0], format=\"csr\")\n        pbmc68k_test.varp[\"test\"] = sparse.eye(pbmc68k_test.shape[1], format=\"csr\")\n        pbmc68k_test.layers[\"test\"] = sparse.csr_matrix(pbmc68k_test.shape)\n        pbmc68k_test.layers[\"test\"][0] = 1.\n        pbmc68k_test.write(\"pbmc68k_test.h5ad\")\n    \"\"\"\n    filepath, _ = urlretrieve(\n        \"https://lamindb-test.s3.amazonaws.com/pbmc68k_test.h5ad\", \"pbmc68k_test.h5ad\"\n    )\n    return Path(filepath)\n\n\ndef anndata_pbmc3k_processed() -> ad.AnnData:\n    \"\"\"Modified from scanpy.pbmc3k_processed().\"\"\"\n    filepath, _ = urlretrieve(\n        \"https://lamindb-test.s3.amazonaws.com/scrnaseq_scanpy_pbmc3k_processed.h5ad\"\n    )\n    pbmc3k = ad.read_h5ad(filepath)\n    pbmc3k.obs.rename(columns={\"louvain\": \"cell_type\"}, inplace=True)\n    return pbmc3k\n\n\ndef anndata_human_immune_cells(\n    populate_registries: bool = False,\n) -> ad.AnnData:\n    \"\"\"Cross-tissue immune cell analysis reveals tissue-specific features in humans.\n\n    From: https://cellxgene.cziscience.com/collections/62ef75e4-cbea-454e-a0ce-998ec40223d3\n    Collection: Global\n\n    To reproduce the subsample::\n        >>> adata = sc.read('Global.h5ad')\n        >>> adata.obs = adata.obs[['donor_id', 'tissue', 'cell_type', 'assay', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id']].copy()\n        >>> sc.pp.subsample(adata, fraction=0.005)\n        >>> del adata.uns[\"development_cache_ontology_term_id_colors\"]\n        >>> del adata.uns[\"sex_ontology_term_id_colors\"]\n        >>> adata.write('human_immune.h5ad')\n    \"\"\"\n    filepath, _ = urlretrieve(\"https://lamindb-test.s3.amazonaws.com/human_immune.h5ad\")\n    
adata = ad.read_h5ad(filepath)\n    adata.var.drop(columns=[\"gene_symbols\", \"feature_name\"], inplace=True)\n    adata.uns.pop(\"cell_type_ontology_term_id_colors\")\n    adata.uns.pop(\"title\")\n    adata.uns.pop(\"schema_version\")\n    adata.obs.columns = adata.obs.columns.str.replace(\"donor_id\", \"donor\")\n    columns = [col for col in adata.obs.columns if \"ontology_term\" not in col]\n    adata.obs = adata.obs[columns]\n    if populate_registries:\n        import bionty as bt\n\n        import lamindb as ln\n\n        ln.save(\n            bt.Gene.from_values(\n                adata.var.index, field=\"ensembl_gene_id\", organism=\"human\"\n            )\n        )\n        ln.save(bt.CellType.from_values(adata.obs.cell_type, field=\"name\"))\n        ln.save(bt.ExperimentalFactor.from_values(adata.obs.assay, field=\"name\"))\n        ln.save(bt.Tissue.from_values(adata.obs.tissue, field=\"name\"))\n        ln.Feature(name=\"cell_type\", dtype=[bt.CellType]).save()  # type: ignore\n        ln.Feature(name=\"assay\", dtype=[bt.ExperimentalFactor]).save()  # type: ignore\n        ln.Feature(name=\"tissue\", dtype=[bt.Tissue]).save()  # type: ignore\n        ln.Feature(name=\"organism\", dtype=[bt.Organism]).save()  # type: ignore\n        ln.Feature(name=\"donor\", dtype=[ln.ULabel]).save()  # type: ignore\n        bt.ExperimentalFactor.from_source(ontology_id=\"EFO:0008913\").save()\n        ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])\n    return adata\n\n\ndef anndata_suo22_Visium10X():\n    \"\"\"AnnData from Suo22 generated by 10x Visium.\"\"\"\n    import anndata as ad\n\n    filepath, _ = urlretrieve(\n        \"https://lamindb-test.s3.amazonaws.com/suo22_Visium10X_data_LI_subset.h5ad\",\n        \"Visium10X_data_LI_subset.h5ad\",\n    )\n    Path(\"suo22/\").mkdir(exist_ok=True)\n    filepath = Path(filepath).rename(\"suo22/Visium10X_data_LI_subset.h5ad\")\n    return ad.read_h5ad(filepath)\n\n\ndef 
mudata_papalexi21_subset(with_uns: bool = False) -> MuData:\n    \"\"\"A subsetted MuData from papalexi21.\n\n    To reproduce the subsetting:\n        >>> !wget https://figshare.com/ndownloader/files/36509460\n        >>> import mudata as md\n        >>> import scanpy as sc\n        >>> mdata = md.read_h5mu(\"36509460\")\n        >>> mdata = sc.pp.subsample(mdata, n_obs=200, copy=True)[0]\n        >>> mdata[:, -300:].copy().write(\"papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu\")\n    \"\"\"\n    import mudata as md\n\n    md.set_options(pull_on_update=False)\n\n    filepath, _ = urlretrieve(\n        \"https://lamindb-test.s3.amazonaws.com/papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu\",\n        \"papalexi21_subset.h5mu\",\n    )\n\n    mdata = md.read_h5mu(filepath)\n\n    mdata.pull_obs()\n\n    # The MuData object is malformed with duplicated information\n    # Drop all columns for the modalities and add them again correspondingly\n    for mod in [\"rna\", \"adt\", \"hto\", \"gdo\"]:\n        mdata[mod].obs.drop(mdata[mod].obs.columns, axis=1, inplace=True)\n    for col in mdata.obs.columns:\n        for mod in [\"rna\", \"adt\", \"hto\", \"gdo\"]:\n            if col.endswith(f\"_{mod.upper()}\"):\n                new_col = col.replace(f\"{mod}:\", \"\")\n                if new_col != col:\n                    mdata[mod].obs[new_col] = mdata.obs.pop(col)\n            else:\n                new_col = col.replace(f\"{mod}:\", \"\")\n                if new_col not in mdata.obs.columns and col in mdata.obs.columns:\n                    mdata.obs[new_col] = mdata.obs.pop(col)\n\n    for col in mdata.obs.columns:\n        for mod in [\"rna\", \"adt\", \"hto\", \"gdo\"]:\n            if col.endswith(f\"_{mod.upper()}\"):\n                del mdata.obs[col]\n\n    for col in [\n        \"orig.ident\",\n        \"MULTI_ID\",\n        \"NT\",\n        \"S.Score\",\n        \"G2M.Score\",\n        \"Phase\",\n        \"gene_target\",\n        
\"guide_ID\",\n        \"HTO_classification\",\n    ]:\n        del mdata.obs[col]\n\n    mdata.push_obs([\"percent.mito\"], mods=[\"rna\"], drop=True)\n    mdata[\"hto\"].obs[\"technique\"] = \"cell hashing\"\n    mdata[\"hto\"].obs[\"technique\"] = mdata[\"hto\"].obs[\"technique\"].astype(\"category\")\n    mdata.pull_obs([\"technique\"], mods=\"hto\")\n\n    if with_uns:\n        mdata.uns[\"study_metadata\"] = {\n            \"temperature\": 21.6,\n            \"experiment\": \"Experiment 1\",\n        }\n        mdata[\"rna\"].uns[\"site_metadata\"] = {\"pos\": 99.9, \"site_id\": \"SITE001\"}\n\n    return mdata\n\n\ndef dict_cellxgene_uns() -> dict[str, Any]:\n    \"\"\"An example CELLxGENE AnnData `.uns` dictionary.\"\"\"\n    uns = {\n        \"organism_ontology_term_id\": \"NCBITaxon:9606\",\n        \"spatial\": {\n            \"is_single\": True,\n            \"library_1\": {  # Dynamic library_id key\n                \"images\": {\n                    \"fullres\": \"path/to/fullres.jpg\",\n                    \"hires\": \"path/to/hires.jpg\",\n                },\n                \"scalefactors\": {\n                    \"spot_diameter_fullres\": 89.43,\n                    \"tissue_hires_scalef\": 0.177,\n                },\n            },\n            \"library_2\": {  # Another dynamic library_id key\n                \"images\": {\n                    \"fullres\": \"path/to/fullres_2.jpg\",\n                    \"hires\": \"path/to/hires_2.jpg\",\n                },\n                \"scalefactors\": {\n                    \"spot_diameter_fullres\": 120.34,\n                    \"tissue_hires_scalef\": 0.355,\n                },\n            },\n        },\n    }\n\n    return uns\n\n\ndef df_iris() -> pd.DataFrame:\n    \"\"\"The iris collection as in sklearn.\n\n    Original code::\n\n        sklearn.collections.load_iris(as_frame=True).frame\n    \"\"\"\n    filepath, _ = urlretrieve(\"https://lamindb-test.s3.amazonaws.com/iris.parquet\")\n    
return pd.read_parquet(filepath)\n\n\ndef df_iris_in_meter() -> pd.DataFrame:\n    \"\"\"The iris collection with lengths in meter.\"\"\"\n    df = df_iris()\n    # rename columns\n    df.rename(\n        columns={\n            \"sepal length (cm)\": \"sepal_length\",\n            \"sepal width (cm)\": \"sepal_width\",\n            \"petal length (cm)\": \"petal_length\",\n            \"petal width (cm)\": \"petal_width\",\n        },\n        inplace=True,\n    )\n    df[[\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]] /= 100\n    df[\"iris_organism_name\"] = df[\"target\"].map(\n        {0: \"setosa\", 1: \"versicolor\", 2: \"virginica\"}\n    )\n    del df[\"target\"]\n    return df\n\n\ndef df_iris_in_meter_study1() -> pd.DataFrame:\n    \"\"\"The iris collection with lengths in meter.\"\"\"\n    df_iris = df_iris_in_meter()\n    return df_iris.iloc[: len(df_iris) // 2]\n\n\ndef df_iris_in_meter_study2() -> pd.DataFrame:\n    \"\"\"The iris collection with lengths in meter.\"\"\"\n    df_iris = df_iris_in_meter()\n    return df_iris.iloc[len(df_iris) // 2 :]\n\n\ndef dir_scrnaseq_cellranger(\n    sample_name: str, basedir: str | Path = \"./\", output_only: bool = True\n) -> Path:\n    \"\"\"Mock cell ranger outputs.\n\n    Args:\n        sample_name: name of the sample\n        basedir: run directory\n        output_only: only return output files\n    \"\"\"\n    basedir = Path(basedir)\n\n    if not output_only:\n        fastqdir = basedir / \"fastq\"\n        fastqdir.mkdir(parents=True, exist_ok=True)\n        fastqfile1 = fastqdir / f\"{sample_name}_R1_001.fastq.gz\"\n        with open(fastqfile1, \"w\") as f:\n            f.write(f\"{base62(n_char=6)}\")\n        fastqfile2 = fastqdir / f\"{sample_name}_R2_001.fastq.gz\"\n        fastqfile2.touch(exist_ok=True)\n        with open(fastqfile2, \"w\") as f:\n            f.write(f\"{base62(n_char=6)}\")\n\n    sampledir = basedir / f\"{sample_name}\"\n    for folder in 
[\"raw_feature_bc_matrix\", \"filtered_feature_bc_matrix\", \"analysis\"]:\n        filedir = sampledir / folder\n        filedir.mkdir(parents=True, exist_ok=True)\n\n    for filename in [\n        \"web_summary.html\",\n        \"metrics_summary.csv\",\n        \"possorted_genome_bam.bam\",\n        \"possorted_genome_bam.bam.bai\",\n        \"molecule_info.h5\",\n        \"cloupe.cloupe\",\n        \"raw_feature_bc_matrix.h5\",\n        \"raw_feature_bc_matrix/barcodes.tsv.gz\",\n        \"raw_feature_bc_matrix/features.tsv.gz\",\n        \"raw_feature_bc_matrix/matrix.mtx.gz\",\n        \"filtered_feature_bc_matrix.h5\",\n        \"filtered_feature_bc_matrix/barcodes.tsv.gz\",\n        \"filtered_feature_bc_matrix/features.tsv.gz\",\n        \"filtered_feature_bc_matrix/matrix.mtx.gz\",\n        \"analysis/analysis.csv\",\n    ]:\n        file = sampledir / filename\n        with open(file, \"w\") as f:\n            f.write(f\"{base62(n_char=6)}\")\n\n    return sampledir\n\n\ndef schmidt22_crispra_gws_IFNG(basedir=\".\") -> Path:\n    \"\"\"CRISPRi screen collection of Schmidt22.\n\n    Originally from: https://zenodo.org/record/5784651\n    \"\"\"\n    filepath, _ = urlretrieve(\n        \"https://lamindb-test.s3.amazonaws.com/schmidt22-crispra-gws-IFNG.csv\",\n        \"schmidt22-crispra-gws-IFNG.csv\",\n    )\n    return Path(filepath).rename(Path(basedir) / filepath)\n\n\ndef schmidt22_perturbseq(basedir=\".\") -> Path:\n    \"\"\"Perturb-seq collection of Schmidt22.\n\n    Subsampled and converted to h5ad from R file: https://zenodo.org/record/5784651\n\n    To reproduce the subsample:\n    >>> adata = sc.read('HuTcellsCRISPRaPerturbSeq_Re-stimulated.h5ad')\n    >>> adata.obs = adata.obs[['cluster_name']]\n    >>> del adata.obsp\n    >>> del adata.var['features']\n    >>> del adata.obsm['X_pca']\n    >>> del adata.uns\n    >>> del adata.raw\n    >>> del adata.varm\n    >>> adata.obs = adata.obs.reset_index()\n    >>> del adata.obs['index']\n    >>> 
sc.pp.subsample(adata, 0.03)\n    >>> adata.write('schmidt22_perturbseq.h5ad')\n    \"\"\"\n    filepath, _ = urlretrieve(\n        \"https://lamindb-test.s3.amazonaws.com/schmidt22_perturbseq.h5ad\",\n        \"schmidt22_perturbseq.h5ad\",\n    )\n    return Path(filepath).rename(Path(basedir) / filepath)\n\n\ndef anndata_visium_mouse_cellxgene() -> ad.AnnData:\n    \"\"\"Visium samples of thymus from wild type B6 mice 3-6 weeks old.\n\n    The dataset is a CELLxGENE schema 7.0.0 validated dataset.\n    \"\"\"\n    filepath, _ = urlretrieve(\n        \"https://datasets.cellxgene.cziscience.com/74f5c380-081f-41e4-9f05-346831fb67e8.h5ad\",\n        \"zhang_2024_pcw56_visium.h5ad\",\n    )\n    return ad.read_h5ad(filepath)\n\n\ndef spatialdata_blobs() -> SpatialData:\n    \"\"\"Example SpatialData dataset for tutorials.\"\"\"\n    from spatialdata.datasets import blobs\n\n    sdata = blobs()\n    sdata.attrs[\"bio\"] = {\n        \"disease\": \"Alzheimer disease\",\n        \"developmental_stage\": \"adult stage\",\n    }\n    sdata.attrs[\"tech\"] = {\n        \"assay\": \"Visium Spatial Gene Expression\",\n    }\n    sdata.attrs[\"random_int\"] = 20\n    sdata.tables[\"table\"].var.index = [\n        \"ENSG00000139618\",  # BRCA2\n        \"ENSG00000157764\",  # BRAF\n        \"ENSG00000999999\",  # Does not exist\n    ]\n    sdata.tables[\"table\"].obs[\"sample_region\"] = pd.Categorical(\n        [\"sample region 1\"] * 13 + [\"sample region 2\"] * 13\n    )\n\n    return sdata\n"
  },
  {
    "path": "lamindb/examples/datasets/_fake.py",
    "content": "from __future__ import annotations\n\n\ndef fake_bio_notebook_titles(n=100) -> list[str]:\n    \"\"\"A fake collection of study titles.\"\"\"\n    from faker import Faker\n\n    fake = Faker()\n\n    from faker_biology.mol_biol import Antibody\n    from faker_biology.physiology import CellType, Organ, Organelle\n\n    fake.add_provider(CellType)\n    fake.add_provider(Organ)\n    fake.add_provider(Organelle)\n    fake.add_provider(Antibody)\n\n    my_words = [\n        \"study\",\n        \"investigate\",\n        \"research\",\n        \"result\",\n        \"cluster\",\n        \"rank\",\n        \"candidate\",\n        \"visualize\",\n        \"efficiency\",\n        \"classify\",\n    ]\n    my_words += [fake.organ() for i in range(5)] + [\"intestine\", \"intestinal\"]\n    my_words += [fake.celltype() for i in range(10)]\n    my_words += [fake.antibody_isotype() for i in range(20)]\n\n    my_notebook_titles = [fake.sentence(ext_word_list=my_words) for i in range(n)]\n\n    return my_notebook_titles\n"
  },
  {
    "path": "lamindb/examples/datasets/_small.py",
    "content": "from __future__ import annotations\n\nfrom typing import Any, Literal\n\nimport anndata as ad\nimport numpy as np\nimport pandas as pd\n\n\ndef small_dataset3_cellxgene(\n    otype: Literal[\"DataFrame\", \"AnnData\"] = \"AnnData\",\n    *,\n    with_obs_defaults: bool = False,\n    with_var_typo: bool = False,\n    with_obs_typo: bool = False,\n    with_uns_organism: bool = False,\n    with_uns_spatial: bool = False,\n) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:\n    var_id = \"invalid_ensembl_id\" if with_var_typo else \"ENSG00000000457\"\n    var_ids = [var_id, \"ENSG00000000419\", \"ENSG00000139618\"]\n    lung_id = \"UBERON:0002048XXX\" if with_obs_typo else \"UBERON:0002048\"\n\n    obs_data = {\n        \"disease_ontology_term_id\": [\n            \"MONDO:0004975\",\n            \"MONDO:0004980\",\n            \"MONDO:0004980\",\n        ],\n        \"development_stage_ontology_term_id\": [\"unknown\", \"unknown\", \"unknown\"],\n        \"sex_ontology_term_id\": [\"PATO:0000383\", \"PATO:0000384\", \"unknown\"],\n        \"tissue_ontology_term_id\": [lung_id, lung_id, \"UBERON:0000948\"],\n        \"cell_type\": [\"T cell\", \"B cell\", \"B cell\"],\n        \"self_reported_ethnicity\": [\"South Asian\", \"South Asian\", \"South Asian\"],\n        \"donor_id\": [\"-1\", \"1\", \"2\"],\n        \"is_primary_data\": [False, False, False],\n        \"suspension_type\": [\"cell\", \"cell\", \"cell\"],\n        \"tissue_type\": [\"tissue\", \"tissue\", \"tissue\"],\n    }\n\n    obs_df = pd.DataFrame(\n        obs_data,\n        index=[\"barcode1\", \"barcode2\", \"barcode3\"],\n    )\n\n    var_df = pd.DataFrame(\n        index=var_ids, data={\"feature_is_filtered\": [False, False, False]}\n    )\n\n    X = pd.DataFrame(\n        {\n            var_ids[0]: [2, 3, 3],\n            var_ids[1]: [3, 4, 5],\n            var_ids[2]: [4, 2, 3],\n        },\n        index=[\"barcode1\", \"barcode2\", \"barcode3\"],\n        
dtype=\"float32\",\n    )\n\n    obs_df[\"donor_id\"] = obs_df[\"donor_id\"].astype(\"category\")\n\n    if otype == \"DataFrame\":\n        return pd.concat([X, obs_df], axis=1)\n    else:\n        adata = ad.AnnData(X=X, obs=obs_df, var=var_df)\n        adata.uns[\"title\"] = \"CELLxGENE example\"\n        adata.obsm[\"X_pca\"] = np.array(\n            [[-1.2, 0.8], [0.5, -0.3], [0.7, -0.5]], dtype=\"float32\"\n        )\n        # CELLxGENE requires the `.raw` slot to be set - https://github.com/chanzuckerberg/single-cell-curation/issues/1304\n        adata.raw = adata.copy()\n        adata.raw.var.drop(columns=\"feature_is_filtered\", inplace=True)\n\n        if with_obs_defaults:\n            adata.obs[\"cell_type_ontology_term_id\"] = [\n                \"CL:0000084\",\n                \"CL:0000236\",\n                \"CL:0000236\",\n            ]\n            adata.obs[\"self_reported_ethnicity_ontology_term_id\"] = \"na\"\n            adata.obs[\"assay_ontology_term_id\"] = \"EFO:1001982\"\n            adata.obs[\"assay\"] = \"single-cell RNA sequencing\"\n        if with_uns_organism:\n            adata.uns[\"organism_ontology_term_id\"] = \"NCBITaxon:9606\"\n            adata.uns[\"organism\"] = \"Homo sapiens\"\n        else:\n            adata.obs[\"organism_ontology_term_id\"] = \"NCBITaxon:9606\"\n            obs_data[\"organism\"] = [\"Homo sapiens\", \"Homo sapiens\", \"Homo sapiens\"]\n        if with_uns_spatial:\n            adata.uns[\"spatial\"] = {\n                \"is_single\": True,\n                \"library_123\": {\n                    \"scalefactors\": {\n                        \"spot_diameter_fullres\": 165.0,\n                        \"tissue_hires_scalef\": 0.5,\n                    },\n                    \"images\": {\n                        \"hires\": np.random.default_rng().integers(\n                            0, 255, (2000, 2000, 3), dtype=np.uint8\n                        )\n                    },\n                },\n     
       }\n\n        return adata\n\n\ndef anndata_with_obs() -> ad.AnnData:\n    \"\"\"Create a mini anndata with cell_type, disease and tissue.\"\"\"\n    import anndata as ad\n    import bionty.base as bionty_base\n\n    celltypes = [\"T cell\", \"hematopoietic stem cell\", \"hepatocyte\", \"my new cell type\"]\n    celltype_ids = [\"CL:0000084\", \"CL:0000037\", \"CL:0000182\", \"\"]\n    diseases = [\n        \"chronic kidney disease\",\n        \"liver lymphoma\",\n        \"cardiac ventricle disorder\",\n        \"Alzheimer disease\",\n    ]\n    tissues = [\"kidney\", \"liver\", \"heart\", \"brain\"]\n    df = pd.DataFrame()\n    df[\"cell_type\"] = celltypes * 10\n    df[\"cell_type_id\"] = celltype_ids * 10\n    df[\"tissue\"] = tissues * 10\n    df[\"disease\"] = diseases * 10\n    df.index = \"obs\" + df.index.astype(str)\n\n    adata = ad.AnnData(X=np.zeros(shape=(40, 100), dtype=np.float32), obs=df)\n    bionty_genes = bionty_base.Gene()\n    # backwards compatible\n    adata.var.index = (\n        (\n            bionty_genes.to_dataframe()\n            if hasattr(bionty_genes, \"to_dataframe\")\n            else bionty_genes.df()\n        )\n        .head(100)[\"ensembl_gene_id\"]\n        .values\n    )\n\n    return adata\n"
  },
  {
    "path": "lamindb/examples/datasets/define_mini_immuno_features_labels.py",
    "content": "import bionty as bt\n\nimport lamindb as ln\n\n# define valid labels\nperturbation_type = ln.Record(name=\"Perturbation\", is_type=True).save()\nln.Record(name=\"DMSO\", type=perturbation_type).save()\nln.Record(name=\"IFNG\", type=perturbation_type).save()\nbt.CellType.from_source(name=\"B cell\").save()\nbt.CellType.from_source(name=\"T cell\").save()\n\n# define valid features\nln.Feature(name=\"perturbation\", dtype=perturbation_type).save()\nln.Feature(name=\"cell_type_by_expert\", dtype=bt.CellType).save()\nln.Feature(name=\"cell_type_by_model\", dtype=bt.CellType).save()\nln.Feature(name=\"assay_oid\", dtype=bt.ExperimentalFactor.ontology_id).save()\nln.Feature(name=\"concentration\", dtype=str).save()\nln.Feature(name=\"treatment_time_h\", dtype=\"num\", coerce=True).save()\nln.Feature(name=\"donor\", dtype=str, nullable=True).save()\nln.Feature(name=\"donor_ethnicity\", dtype=list[bt.Ethnicity]).save()\n"
  },
  {
    "path": "lamindb/examples/datasets/define_mini_immuno_schema_flexible.py",
    "content": "import lamindb as ln\n\nschema = ln.Schema(\n    name=\"Mini immuno schema\",\n    features=[\n        ln.Feature.get(name=\"perturbation\"),\n        ln.Feature.get(name=\"cell_type_by_model\"),\n        ln.Feature.get(name=\"assay_oid\"),\n        ln.Feature.get(name=\"donor\"),\n        ln.Feature.get(name=\"concentration\"),\n        ln.Feature.get(name=\"treatment_time_h\"),\n    ],\n    flexible=True,  # _additional_ columns in a dataframe are validated & annotated\n).save()\n"
  },
  {
    "path": "lamindb/examples/datasets/mini_immuno.py",
    "content": "\"\"\"Two \"mini immuno\" datasets.\n\nDatasets\n--------\n\n.. autofunction:: get_dataset1\n.. autofunction:: get_dataset2\n\nSchemas\n-------\n\n.. autofunction:: define_features_labels\n.. autofunction:: define_mini_immuno_schema_flexible\n\nUtilities\n---------\n\n.. autofunction:: save_mini_immuno_datasets\n\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom datetime import date\nfrom typing import TYPE_CHECKING, Literal\n\nimport anndata as ad\nimport pandas as pd\n\nif TYPE_CHECKING:\n    from lamindb.models import Schema\n\n\ndef define_features_labels() -> None:\n    \"\"\"Features & labels to validate the mini immuno datasets.\n\n    .. literalinclude:: scripts/define_mini_immuno_features_labels.py\n        :language: python\n    \"\"\"\n    from . import define_mini_immuno_features_labels  # noqa\n\n\ndef define_mini_immuno_schema_flexible() -> Schema:\n    \"\"\"Features & labels to validate the mini immuno datasets.\n\n    .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py\n        :language: python\n    \"\"\"\n    from lamindb.models import Schema\n\n    define_features_labels()\n    from . import define_mini_immuno_schema_flexible  # noqa\n\n    return Schema.get(name=\"Mini immuno schema\")\n\n\ndef save_mini_immuno_datasets():\n    \"\"\"Save the two \"mini immuno\" datasets.\n\n    .. literalinclude:: scripts/save_mini_immuno_datasets.py\n        :language: python\n    \"\"\"\n    from . 
import save_mini_immuno_datasets  # noqa\n\n\ndef get_dataset1(\n    otype: Literal[\"DataFrame\", \"AnnData\"] = \"DataFrame\",\n    gene_symbols_in_index: bool = False,\n    with_typo: bool = False,\n    with_cell_type_synonym: bool = False,\n    with_cell_type_typo: bool = False,\n    with_gene_typo: bool = False,\n    with_outdated_gene: bool = False,\n    with_wrong_subtype: bool = False,\n    with_index_type_mismatch: bool = False,\n    with_date_as_iso_string: bool = True,\n) -> pd.DataFrame | ad.AnnData:\n    \"\"\"A small tabular dataset measuring expression & metadata.\"\"\"\n    # define the data in the dataset\n    # it's a mix of numerical measurements and observation-level metadata\n    ifng = \"IFNJ\" if with_typo else \"IFNG\"\n    thing = \"ulabel_but_not_perturbation\" if with_wrong_subtype else \"DMSO\"\n    if gene_symbols_in_index:\n        var_ids = [\"CD8A\", \"CD4\", \"CD14\" if not with_gene_typo else \"GeneTypo\"]\n    else:\n        var_ids = [\n            \"ENSG00000153563\",\n            \"ENSG00000010610\",\n            \"ENSG00000170458\"\n            if not with_gene_typo\n            else \"GeneTypo\"\n            if not with_outdated_gene\n            else \"ENSG00000278198\",\n        ]\n    abt_cell = (\n        \"CD8-pos alpha-beta T cell\"\n        if with_cell_type_typo\n        else \"CD8-positive, alpha-beta T cell\"\n    )\n    dataset_dict = {\n        var_ids[0]: [1, 2, 3],\n        var_ids[1]: [3, 4, 5],\n        var_ids[2]: [5, 6, 7],\n        \"perturbation\": pd.Categorical([\"DMSO\", ifng, thing]),\n        \"sample_note\": [\"was ok\", \"looks naah\", \"pretty! 
🤩\"],\n        \"cell_type_by_expert\": pd.Categorical(\n            [\"B-cell\" if with_cell_type_synonym else \"B cell\", abt_cell, abt_cell]\n        ),\n        \"cell_type_by_model\": pd.Categorical([\"B cell\", \"T cell\", \"T cell\"]),\n        \"assay_oid\": pd.Categorical([\"EFO:0008913\", \"EFO:0008913\", \"EFO:0008913\"]),\n        \"concentration\": [\"0.1%\", \"200 nM\", \"0.1%\"],\n        \"treatment_time_h\": [24, 24, 6],\n        \"donor\": [\"D0001\", \"D0002\", None],\n        \"donor_ethnicity\": [\n            [\"Chinese\", \"Singaporean Chinese\"],\n            [\"Chinese\", \"Han Chinese\"],\n            [\"Chinese\"],\n        ],\n    }\n    # define the dataset-level metadata\n    metadata = {\n        \"temperature\": 21.6,\n        \"experiment\": \"Experiment 1\",\n        \"date_of_study\": \"2024-12-01\" if with_date_as_iso_string else date(2024, 12, 1),\n        \"study_note\": \"We had a great time performing this study and the results look compelling.\",\n    }\n    # the dataset as DataFrame\n    dataset_df = pd.DataFrame(\n        dataset_dict,\n        index=[\"sample1\", \"sample2\", 0]  # type: ignore\n        if with_index_type_mismatch\n        else [\"sample1\", \"sample2\", \"sample3\"],\n    )\n    if otype == \"DataFrame\":\n        for key, value in metadata.items():\n            dataset_df.attrs[key] = value\n        return dataset_df\n    else:\n        del dataset_df[\n            \"donor_ethnicity\"\n        ]  # remove the donor_ethnicity because AnnData save will error\n        dataset_ad = ad.AnnData(\n            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata\n        )\n        return dataset_ad\n\n\ndef get_dataset2(\n    otype: Literal[\"DataFrame\", \"AnnData\"] = \"DataFrame\",\n    gene_symbols_in_index: bool = False,\n    with_date_as_iso_string: bool = True,\n) -> pd.DataFrame | ad.AnnData:\n    \"\"\"A second small tabular dataset measuring expression & metadata.\"\"\"\n    if 
gene_symbols_in_index:\n        var_ids = [\"CD8A\", \"CD4\", \"CD38\"]\n    else:\n        var_ids = [\"ENSG00000153563\", \"ENSG00000010610\", \"ENSG00000004468\"]\n    dataset_dict = {\n        var_ids[0]: [2, 3, 3],\n        var_ids[1]: [3, 4, 5],\n        var_ids[2]: [4, 2, 3],\n        \"perturbation\": pd.Categorical([\"DMSO\", \"IFNG\", \"IFNG\"]),\n        \"cell_type_by_model\": pd.Categorical([\"B cell\", \"T cell\", \"T cell\"]),\n        \"concentration\": [\"0.1%\", \"200 nM\", \"0.1%\"],\n        \"treatment_time_h\": [24, 24, 6],\n        \"donor\": [\"D0003\", \"D0003\", \"D0004\"],\n    }\n    metadata = {\n        \"temperature\": 22.6,\n        \"experiment\": \"Experiment 2\",\n        \"date_of_study\": \"2025-02-13\" if with_date_as_iso_string else date(2025, 2, 13),\n    }\n    dataset_df = pd.DataFrame(\n        dataset_dict,\n        index=[\"sample4\", \"sample5\", \"sample6\"],\n    )\n    ad.AnnData(\n        dataset_df[var_ids],\n        obs=dataset_df[[\"perturbation\", \"cell_type_by_model\"]],\n    )\n    if otype == \"DataFrame\":\n        for key, value in metadata.items():\n            dataset_df.attrs[key] = value\n        return dataset_df\n    else:\n        dataset_ad = ad.AnnData(\n            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata\n        )\n        return dataset_ad\n"
  },
  {
    "path": "lamindb/examples/datasets/save_mini_immuno_datasets.py",
    "content": "from datetime import date\n\nimport bionty as bt\n\nimport lamindb as ln\n\n## define valid labels\nln.Record.from_values([\"DMSO\", \"IFNG\"], create=True).save()\nln.Record.from_values([\"Experiment 1\", \"Experiment 2\"], create=True).save()\nbt.CellType.from_values([\"B cell\", \"T cell\"]).save()\n\n# observation-level metadata\nln.Feature(name=\"perturbation\", dtype=ln.Record).save()\nln.Feature(name=\"sample_note\", dtype=str).save()\nln.Feature(name=\"cell_type_by_expert\", dtype=bt.CellType).save()\nln.Feature(name=\"cell_type_by_model\", dtype=bt.CellType).save()\n# dataset-level metadata\nln.Feature(name=\"temperature\", dtype=float).save()\nln.Feature(name=\"experiment\", dtype=ln.Record).save()\nln.Feature(name=\"date_of_study\", dtype=date, coerce=True).save()\nln.Feature(name=\"study_note\", dtype=str).save()\nln.Feature(name=\"study_metadata\", dtype=dict).save()\n\nschema = ln.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs()\n\n## Ingest dataset1\nadata = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"AnnData\")\nartifact = ln.Artifact.from_anndata(\n    adata,\n    key=\"examples/dataset1.h5ad\",\n    schema=schema,\n).save()\nadhoc = {\"study_metadata\": {\"detail1\": \"123\", \"detail2\": 1}}\ndataset_metadata = adata.uns\ndataset_metadata.update(adhoc)\nartifact.features.add_values(dataset_metadata)  # type: ignore\n\n# Ingest dataset2\nadata2 = ln.examples.datasets.mini_immuno.get_dataset2(otype=\"AnnData\")\nartifact2 = ln.Artifact.from_anndata(\n    adata2,\n    key=\"examples/dataset2.h5ad\",\n    schema=schema,\n).save()\nadhoc2 = {\"study_metadata\": {\"detail1\": \"456\", \"detail2\": 2}}\ndataset_metadata2 = adata2.uns\ndataset_metadata2.update(adhoc2)\nartifact2.features.add_values(dataset_metadata2)  # type: ignore\n"
  },
  {
    "path": "lamindb/examples/fixtures/__init__.py",
    "content": ""
  },
  {
    "path": "lamindb/examples/fixtures/sheets.py",
    "content": "import bionty as bt\nimport pandas as pd\nimport pytest\n\nimport lamindb as ln\n\n\n@pytest.fixture(scope=\"module\")\ndef populate_sheets_compound_treatment():\n    # Compounds ---------------------------\n\n    compound_type = ln.Record(name=\"Compound\", is_type=True).save()\n\n    # features for compounds\n    structure = ln.Feature(name=\"structure\", dtype=\"str\").save()\n\n    # drug1\n    drug1 = ln.Record(name=\"drug1\", type=compound_type).save()\n    ln.models.RecordJson(record=drug1, feature=structure, value=\"12345\").save()\n    # drug2\n    drug2 = ln.Record(name=\"drug2\", type=compound_type).save()\n    ln.models.RecordJson(record=drug2, feature=structure, value=\"45678\").save()\n\n    # Treatments ---------------------------\n\n    treatment_type = ln.Record(name=\"Treatment\", is_type=True).save()\n\n    # features for treatments\n    compound = ln.Feature(name=\"compound\", dtype=compound_type).save()\n    concentration = ln.Feature(name=\"concentration\", dtype=\"num\").save()\n    # a sheet for treatments\n    treatments_sheet = ln.Record(\n        name=\"My treatments 2025-05\", type=treatment_type, is_type=True\n    ).save()  # sheet without validating schema\n\n    # populate treatment1\n    treatment1 = ln.Record(name=\"treatment1\", type=treatments_sheet).save()\n    ln.models.RecordRecord(record=treatment1, feature=compound, value=drug1).save()\n    assert drug1 in treatment1.linked_records.all()\n    assert treatment1 in drug1.linked_in_records.all()\n    ln.models.RecordJson(record=treatment1, feature=concentration, value=\"2nM\").save()\n    # populate treatment2\n    treatment2 = ln.Record(name=\"treatment2\", type=treatments_sheet).save()\n    ln.models.RecordRecord(record=treatment2, feature=compound, value=drug2).save()\n    ln.models.RecordJson(record=treatment2, feature=concentration, value=\"4nM\").save()\n\n    # Samples ---------------------------\n\n    # features named id, uid or name conflict with django 
field names, we test them here\n    id_feature = ln.Feature(name=\"id\", dtype=int).save()\n    uid_feature = ln.Feature(name=\"uid\", dtype=str).save()\n    name_feature = ln.Feature(name=\"name\", dtype=str).save()\n\n    project = ln.Feature(name=\"project\", dtype=ln.Project).save()\n    project1 = ln.Project(name=\"Project 1\").save()\n    sample_type = ln.Record(name=\"BioSample\", is_type=True).save()\n    treatment = ln.Feature(name=\"treatment\", dtype=treatment_type).save()\n    cell_line = ln.Feature(name=\"cell_line\", dtype=bt.CellLine).save()\n    preparation_date = ln.Feature(name=\"preparation_date\", dtype=\"datetime\").save()\n    cell_line._dtype_str = (\n        \"cat[bionty.CellLine]\"  # might have previously been set to \"cat\"\n    )\n    cell_line.save()\n    sample_schema1 = ln.Schema(\n        name=\"My samples schema 2025-06\",\n        features=[\n            id_feature,\n            uid_feature,\n            name_feature,\n            treatment,\n            cell_line,\n            preparation_date,\n            project,\n        ],\n    ).save()\n    sample_sheet1 = ln.Record(\n        name=\"My samples 2025-06\", schema=sample_schema1, type=sample_type\n    ).save()\n    # values for cell lines\n    hek293t = bt.CellLine.from_source(\"HEK293T\").save()\n\n    # populate sample1\n    sample1 = ln.Record(name=\"sample1\", type=sample_sheet1).save()\n    ln.models.RecordJson(record=sample1, feature=id_feature, value=1).save()\n    ln.models.RecordJson(record=sample1, feature=uid_feature, value=\"S1\").save()\n    ln.models.RecordJson(record=sample1, feature=name_feature, value=\"Sample 1\").save()\n    ln.models.RecordRecord(record=sample1, feature=treatment, value=treatment1).save()\n    bt.models.RecordCellLine(record=sample1, feature=cell_line, value=hek293t).save()\n    ln.models.RecordJson(\n        record=sample1, feature=preparation_date, value=\"2025-06-01T05:00:00\"\n    ).save()\n    ln.models.RecordProject(record=sample1, 
feature=project, value=project1).save()\n    # populate sample2\n    sample2 = ln.Record(name=\"sample2\", type=sample_sheet1).save()\n    ln.models.RecordJson(record=sample2, feature=id_feature, value=2).save()\n    ln.models.RecordJson(record=sample2, feature=uid_feature, value=\"S2\").save()\n    ln.models.RecordJson(record=sample2, feature=name_feature, value=\"Sample 2\").save()\n    ln.models.RecordRecord(record=sample2, feature=treatment, value=treatment2).save()\n    bt.models.RecordCellLine(record=sample2, feature=cell_line, value=hek293t).save()\n    ln.models.RecordJson(\n        record=sample2, feature=preparation_date, value=\"2025-06-01T06:00:00\"\n    ).save()\n    ln.models.RecordProject(record=sample2, feature=project, value=project1).save()\n\n    # another sheet for samples\n    sample_note = ln.Feature(name=\"sample_note\", dtype=\"str\").save()\n    sample_schema2 = ln.Schema(\n        name=\"My samples schema 2025-07\",\n        features=[treatment, cell_line, sample_note, project],\n    ).save()\n    # the sheet\n    sample_sheet2 = ln.Record(\n        name=\"My samples 2025-07\", schema=sample_schema2, type=sample_type\n    ).save()\n    # populate sample3\n    sample3 = ln.Record(type=sample_sheet2).save()  # no name\n    ln.models.RecordRecord(record=sample3, feature=treatment, value=treatment1).save()\n    bt.models.RecordCellLine(record=sample3, feature=cell_line, value=hek293t).save()\n    ln.models.RecordJson(\n        record=sample3, feature=preparation_date, value=\"2025-06-02T05:00:00Z\"\n    ).save()\n    ln.models.RecordProject(record=sample3, feature=project, value=project1).save()\n    # populate sample4\n    sample4 = ln.Record(type=sample_sheet2).save()\n    ln.models.RecordRecord(record=sample4, feature=treatment, value=treatment2).save()\n    bt.models.RecordCellLine(record=sample4, feature=cell_line, value=hek293t).save()\n    ln.models.RecordJson(\n        record=sample4, feature=preparation_date, 
value=\"2025-06-02T06:00:00Z\"\n    ).save()\n    ln.models.RecordProject(record=sample4, feature=project, value=project1).save()\n\n    yield treatments_sheet, sample_sheet1\n\n    sample4.delete(permanent=True)\n    sample3.delete(permanent=True)\n    sample_sheet2.delete(permanent=True)\n    sample_schema2.delete(permanent=True)\n    sample_note.delete(permanent=True)\n    sample2.delete(permanent=True)\n    sample1.delete(permanent=True)\n    # hek293t.delete(permanent=True)  # not for now\n    sample_sheet1.delete(permanent=True)\n    sample_schema1.delete(permanent=True)\n    preparation_date.delete(permanent=True)\n    cell_line.delete(permanent=True)\n    # sample_type.delete(permanent=True)   # not for now\n    treatment2.delete(permanent=True)\n    treatment1.delete(permanent=True)\n    treatments_sheet.delete(permanent=True)\n    treatment_type.delete(permanent=True)\n    concentration.delete(permanent=True)\n    drug2.delete(permanent=True)\n    drug1.delete(permanent=True)\n    structure.delete(permanent=True)\n    compound.delete(permanent=True)\n    compound_type.delete(permanent=True)\n\n\n@pytest.fixture(scope=\"module\")\ndef populate_nextflow_sheet_with_samples():\n    # Biosample schema and type\n    samples_schema = ln.Schema(\n        name=\"Biosample test schema\",\n        features=[\n            ln.Feature(name=\"species\", dtype=\"cat[bionty.Organism]\").save(),\n            ln.Feature(name=\"cell_type\", dtype=\"cat[bionty.CellType]\").save(),\n            ln.Feature(name=\"tissue\", dtype=\"cat[bionty.Tissue]\").save(),\n        ],\n    ).save()\n\n    biosample_type = ln.Record(name=\"BioSample\", is_type=True).save()\n\n    # Biosamples sheet\n    samples_sheet = ln.Record(\n        name=\"My samples 2025-04\", schema=samples_schema, type=biosample_type\n    ).save()\n    sample_x = ln.Record(name=\"Sample_X\", type=samples_sheet).save()\n    sample_y = ln.Record(name=\"Sample_Y\", type=samples_sheet).save()\n\n    organism_human = 
bt.Organism.from_source(name=\"human\").save()\n    celltype_tcell = bt.CellType.from_source(name=\"T cell\").save()\n    tissue_blood = bt.Tissue.from_source(name=\"blood\").save()\n\n    features = ln.Feature.lookup()\n    for sample in [sample_x, sample_y]:\n        bt.models.RecordOrganism(\n            record=sample, feature=features.species, value=organism_human\n        ).save()\n        bt.models.RecordCellType(\n            record=sample, feature=features.cell_type, value=celltype_tcell\n        ).save()\n        bt.models.RecordTissue(\n            record=sample, feature=features.tissue, value=tissue_blood\n        ).save()\n\n    # Nextflow samplesheet schema\n    nextflow_schema = ln.Schema(\n        name=\"RNA-seq standard\",\n        features=[\n            ln.Feature(name=\"sample\", dtype=biosample_type).save(),\n            ln.Feature(name=\"fastq_1\", dtype=str).save(),\n            ln.Feature(name=\"fastq_2\", dtype=str).save(),\n            ln.Feature(name=\"expected_cells\", dtype=int).save(),\n            ln.Feature(name=\"seq_center\", dtype=str).save().with_config(optional=True),\n        ],\n        ordered_set=True,\n    ).save()\n\n    nextflowsample_type = ln.Record(name=\"NextflowSample\", is_type=True).save()\n    nextflow_sheet = ln.Record(\n        schema=nextflow_schema,\n        name=\"RNA-seq nextflow samplesheet 001\",\n        type=nextflowsample_type,\n        is_type=True,\n    ).save()\n\n    sample_data = {\n        \"sample\": [\"Sample_X\", \"Sample_Y\", \"Sample_Y\"],\n        \"fastq_1\": [\n            \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R1_001.fastq.gz\",\n            \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R1_001.fastq.gz\",\n            \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R1_001.fastq.gz\",\n        ],\n        
\"fastq_2\": [\n            \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R2_001.fastq.gz\",\n            \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R2_001.fastq.gz\",\n            \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R2_001.fastq.gz\",\n        ],\n        \"expected_cells\": [5000, 5000, 5000],\n    }\n    df = pd.DataFrame(sample_data)\n\n    features = ln.Feature.lookup()\n    nextflow_samples = []\n    for _, row in df.iterrows():\n        sample = ln.Record(type=nextflow_sheet).save()\n        nextflow_samples.append(sample)\n        ln.models.RecordRecord(\n            record=sample,\n            feature=features.sample,\n            value=ln.Record.get(name=row[\"sample\"]),\n        ).save()\n        ln.models.RecordJson(\n            record=sample, feature=features.fastq_1, value=row[\"fastq_1\"]\n        ).save()\n        ln.models.RecordJson(\n            record=sample, feature=features.fastq_2, value=row[\"fastq_2\"]\n        ).save()\n        ln.models.RecordJson(\n            record=sample, feature=features.expected_cells, value=row[\"expected_cells\"]\n        ).save()\n\n    yield nextflow_sheet\n\n    # Delete in reverse order of creation\n    # Delete nextflow samples\n    for sample in reversed(nextflow_samples):\n        sample.delete(permanent=True)\n\n    # Delete nextflow sheet and schema\n    nextflow_sheet.delete(permanent=True)\n    nextflowsample_type.delete(permanent=True)\n    nextflow_schema.delete(permanent=True)\n\n    # Delete samples sheet and schema\n    samples_sheet.records.all().delete(permanent=True)\n    samples_sheet.delete(permanent=True)\n    # biosample_type.delete(permanent=True)  # not for now (shared with first fixture)\n    samples_schema.delete(permanent=True)\n\n    print(ln.Schema.to_dataframe())\n\n    # Delete nextflow schema 
features\n    features = ln.Feature.lookup()\n    features.seq_center.delete(permanent=True)\n    features.expected_cells.delete(permanent=True)\n    features.fastq_2.delete(permanent=True)\n    features.fastq_1.delete(permanent=True)\n    features.sample.delete(permanent=True)\n\n    # Delete biosamples\n    sample_y.delete(permanent=True)\n    sample_x.delete(permanent=True)\n\n    # Delete biosample schema features\n    features.tissue.delete(permanent=True)\n    features.cell_type.delete(permanent=True)\n    features.species.delete(permanent=True)\n\n    # Note: organism_human, celltype_tcell, tissue_blood are from bionty\n    # and might be shared, so not deleting them (similar to hek293t in first fixture)\n"
  },
  {
    "path": "lamindb/examples/mlflow/__init__.py",
    "content": "\"\"\"Examples and utilities for Mlflow.\n\n.. autofunction:: save_mlflow_features\n\"\"\"\n\nimport lamindb as ln\n\n\ndef save_mlflow_features():\n    \"\"\"Saves all MLflow experiment and run related features.\n\n    Saves the following features:\n\n    - mlflow_run_id\n    - mlflow_run_name\n    - mlflow_experiment_id\n    - mlflow_experiment_name\n    - mlflow_user_id\n    - mlflow_status\n    - mlflow_lifecycle_stage\n    - mlflow_artifact_uri\n    - mlflow_start_time\n    - mlflow_end_time\n    \"\"\"\n    mlflow_type = ln.Feature(name=\"MLflow\", is_type=True).save()\n    ln.Feature(name=\"mlflow_run_id\", dtype=str, type=mlflow_type).save()\n    ln.Feature(name=\"mlflow_run_name\", dtype=str, type=mlflow_type).save()\n    ln.Feature(name=\"mlflow_experiment_id\", dtype=str, type=mlflow_type).save()\n    ln.Feature(name=\"mlflow_experiment_name\", dtype=str, type=mlflow_type).save()\n    ln.Feature(name=\"mlflow_user_id\", dtype=str, type=mlflow_type).save()\n    ln.Feature(name=\"mlflow_status\", dtype=str, type=mlflow_type).save()\n    ln.Feature(name=\"mlflow_lifecycle_stage\", dtype=str, type=mlflow_type).save()\n    ln.Feature(name=\"mlflow_artifact_uri\", dtype=str, type=mlflow_type).save()\n    ln.Feature(name=\"mlflow_start_time\", dtype=int, type=mlflow_type).save()\n    ln.Feature(name=\"mlflow_end_time\", dtype=int, type=mlflow_type).save()\n"
  },
  {
    "path": "lamindb/examples/schemas/__init__.py",
    "content": "\"\"\"Example schemas.\n\n.. autofunction:: valid_features\n.. autofunction:: anndata_ensembl_gene_ids_and_valid_features_in_obs\n\n\"\"\"\n\nfrom ._anndata import anndata_ensembl_gene_ids_and_valid_features_in_obs\nfrom ._simple import valid_features\n"
  },
  {
    "path": "lamindb/examples/schemas/_anndata.py",
    "content": "from __future__ import annotations\n\nimport importlib\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from ... import Schema\n\n\ndef anndata_ensembl_gene_ids_and_valid_features_in_obs() -> Schema:\n    \"\"\"An `AnnData` schema validating Ensembl gene IDs and valid features in obs.\n\n    .. literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py\n        :language: python\n    \"\"\"\n    from ... import Schema\n\n    try:\n        return Schema.get(name=\"anndata_ensembl_gene_ids_and_valid_features_in_obs\")\n    except Schema.DoesNotExist:\n        from . import define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs  # noqa\n\n        try:\n            return Schema.get(name=\"anndata_ensembl_gene_ids_and_valid_features_in_obs\")\n        except Schema.DoesNotExist:\n            importlib.reload(\n                define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs\n            )\n            return Schema.get(name=\"anndata_ensembl_gene_ids_and_valid_features_in_obs\")\n"
  },
  {
    "path": "lamindb/examples/schemas/_simple.py",
    "content": "from __future__ import annotations\n\nimport importlib\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from ... import Schema\n\n\ndef valid_features() -> Schema:\n    \"\"\"A `DataFrame` schema that validates that columns map on existing features.\n\n    .. literalinclude:: scripts/define_valid_features.py\n        :language: python\n    \"\"\"\n    from ... import Schema\n\n    try:\n        return Schema.get(name=\"valid_features\")\n    except Schema.DoesNotExist:\n        try:\n            from . import define_valid_features  # noqa\n\n            return Schema.get(name=\"valid_features\")\n        except Schema.DoesNotExist:\n            importlib.reload(define_valid_features)\n            return Schema.get(name=\"valid_features\")\n"
  },
  {
    "path": "lamindb/examples/schemas/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py",
    "content": "import bionty as bt\n\nimport lamindb as ln\n\nobs_schema = ln.examples.schemas.valid_features()\nvarT_schema = ln.Schema(\n    name=\"valid_ensembl_gene_ids\", itype=bt.Gene.ensembl_gene_id\n).save()\nschema = ln.Schema(\n    name=\"anndata_ensembl_gene_ids_and_valid_features_in_obs\",\n    otype=\"AnnData\",\n    slots={\"obs\": obs_schema, \"var.T\": varT_schema},\n).save()\n"
  },
  {
    "path": "lamindb/examples/schemas/define_valid_features.py",
    "content": "import lamindb as ln\n\nschema = ln.Schema(name=\"valid_features\", itype=ln.Feature).save()\n"
  },
  {
    "path": "lamindb/examples/wandb/__init__.py",
    "content": "\"\"\"Examples and utilities for Weights & Biases.\n\n.. autofunction:: save_wandb_features\n\"\"\"\n\nimport lamindb as ln\n\n\ndef save_wandb_features():\n    \"\"\"Saves all Weights & Biases project and run related features.\n\n    Saves the following features:\n\n    - wandb_run_id\n    - wandb_run_name\n    - wandb_run_entity\n    - wandb_project\n    - wandb_state\n    - wandb_url\n    - wandb_tags\n    - wandb_group\n    - wandb_job_type\n    - timestamp\n    - runtime\n    \"\"\"\n    wandb_type = ln.Feature(name=\"Weights & Biases\", is_type=True).save()\n    ln.Feature(name=\"wandb_run_id\", dtype=str, type=wandb_type).save()\n    ln.Feature(name=\"wandb_run_name\", dtype=str, type=wandb_type).save()\n    ln.Feature(name=\"wandb_run_entity\", dtype=str, type=wandb_type).save()\n    ln.Feature(name=\"wandb_project\", dtype=str, type=wandb_type).save()\n    ln.Feature(name=\"wandb_state\", dtype=str, type=wandb_type).save()\n    ln.Feature(name=\"wandb_url\", dtype=str, type=wandb_type).save()\n    ln.Feature(name=\"wandb_tags\", dtype=str, type=wandb_type).save()\n    ln.Feature(name=\"wandb_group\", dtype=str, type=wandb_type).save()\n    ln.Feature(name=\"wandb_job_type\", dtype=str, type=wandb_type).save()\n    ln.Feature(name=\"wandb_timestamp\", dtype=float, type=wandb_type).save()\n    ln.Feature(name=\"wandb_runtime\", dtype=float, type=wandb_type).save()\n"
  },
  {
    "path": "lamindb/integrations/__init__.py",
    "content": "\"\"\"Integrations.\n\nModules\n-------\n\n.. autosummary::\n   :toctree: .\n\n   lightning\n\nFunctions\n---------\n\n.. autofunction:: save_vitessce_config\n.. autofunction:: save_tiledbsoma_experiment\n.. autofunction:: curate_from_croissant\n\n\"\"\"\n\nfrom ._croissant import curate_from_croissant\nfrom ._vitessce import save_vitessce_config\n\n__all__ = [\n    \"lightning\",\n    \"save_tiledbsoma_experiment\",\n    \"curate_from_croissant\",\n    \"save_vitessce_config\",\n]\n\n\ndef __getattr__(name: str):\n    \"\"\"Lazy-import save_tiledbsoma_experiment to avoid loading storage at package import.\"\"\"\n    if name == \"save_tiledbsoma_experiment\":\n        from lamindb.core.storage import save_tiledbsoma_experiment\n\n        return save_tiledbsoma_experiment\n    raise AttributeError(f\"module {__name__!r} has no attribute {name!r}\")\n"
  },
  {
    "path": "lamindb/integrations/_croissant.py",
    "content": "from __future__ import annotations\n\nimport json\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any\n\nimport lamindb_setup as ln_setup\nfrom lamin_utils import logger\nfrom lamindb_setup.core.upath import UPath\n\nif TYPE_CHECKING:\n    from lamindb_setup.types import AnyPathStr\n\n    import lamindb as ln\n\n\ndef curate_from_croissant(\n    croissant_data: AnyPathStr | dict[str, Any],\n    run: ln.Run | None = None,\n) -> ln.Artifact | ln.Collection:\n    \"\"\"Create annotated artifacts from a CroissantML file.\n\n    Returns a collection if multiple files are found in `croissant_data`, otherwise a single artifact.\n\n    Args:\n        croissant_data: Path to CroissantML JSON file or dictionary.\n\n    Example:\n\n        ::\n\n            artifact = ln.integrations.curate_from_croissant(\"dataset_metadata.json\")\n    \"\"\"\n    import lamindb as ln\n\n    from ..models.artifact import check_path_in_existing_storage\n\n    # Load CroissantML data\n    if isinstance(croissant_data, (str, Path, UPath)):\n        croissant_path = UPath(croissant_data)\n        if not croissant_path.exists():\n            raise FileNotFoundError(f\"File not found: {croissant_data}\")\n        with croissant_path.open(encoding=\"utf-8\") as f:\n            data = json.load(f)\n    elif isinstance(croissant_data, dict):\n        data = croissant_data\n    else:\n        raise ValueError(\n            \"croissant_data must be a file path, JSON string, or dictionary\"\n        )\n\n    # Validate basic structure\n    if data.get(\"@type\") != \"Dataset\":\n        raise ValueError(\"CroissantML @type must be 'Dataset'\")\n\n    if \"name\" not in data:\n        raise ValueError(\"CroissantML must have a 'name' field\")\n\n    # Extract basic metadata\n    dataset_name = data[\"name\"]\n    description = data.get(\"description\", None)\n    version = data.get(\"version\", None)\n    license_info = data.get(\"license\", None)\n    project_name = 
data.get(\"cr:projectName\", None)\n\n    # Create license feature and label if license info exists\n    license_label = None\n    if license_info:\n        license_label_type = ln.ULabel.filter(name=\"License\", is_type=True).first()\n        if not license_label_type:\n            license_label_type = ln.ULabel(name=\"License\", is_type=True).save()\n        license_label = ln.ULabel.filter(name=license_info).first()\n        if not license_label:\n            license_label = ln.ULabel(\n                name=license_info,\n                description=\"Dataset license\",\n                type=license_label_type,\n            ).save()\n    project_label = None\n    if project_name:\n        project_label = ln.Project.filter(name=project_name).first()\n        if not project_label:\n            project_label = ln.Project(name=project_name).save()\n\n    # Extract file distributions\n    artifacts = []\n    file_distributions = data.get(\"distribution\", [])\n    if not file_distributions:\n        raise ValueError(\"No file distributions found in croissant data\")\n    for dist in file_distributions:\n        file_id = dist.get(\"@id\", \"\")\n        if UPath(file_id).exists():\n            file_path = file_id\n        else:\n            content_url = dist.get(\"contentUrl\", \"\")\n            file_path = content_url or data.get(\"url\", \"\")\n        if not file_path:\n            raise ValueError(f\"No file path found in croissant distribution: {dist}\")\n        if not UPath(file_path).exists():\n            raise ValueError(f\"Inferred file path does not exist: {file_path}\")\n        result = check_path_in_existing_storage(\n            file_path, check_hub_register_storage=ln_setup.settings.instance.is_on_hub\n        )\n        if isinstance(result, ln.Storage):\n            key = None  # will automatically use existing storage key\n        else:\n            current_storage_location = (\n                ln.settings.storage\n                if not 
ln.setup.settings.instance.keep_artifacts_local\n                else ln.settings.local_storage\n            )\n            logger.warning(\n                f\"file path {file_path} is not part of a known storage location, will be duplicated to: {current_storage_location}\"\n            )\n            key = file_id\n        if len(file_distributions) == 1:\n            # it doesn't make sense to have the dataset name on the individual\n            # artifact if it's part of a collection\n            artifact_description = dataset_name\n            if description is not None:\n                artifact_description += f\" - {description}\"\n        else:\n            artifact_description = None\n        artifact = ln.Artifact(  # type: ignore\n            file_path,\n            key=key,\n            description=artifact_description,\n            version=version,\n            kind=\"dataset\",\n            run=run,\n        ).save()\n        if license_label:\n            artifact.ulabels.add(license_label)\n        if project_label:\n            artifact.projects.add(project_label)\n        artifacts.append(artifact)\n\n    if len(artifacts) == 1:\n        return artifacts[0]\n    else:\n        collection = ln.Collection(  # type: ignore\n            artifacts, key=dataset_name, description=description, version=version\n        ).save()\n        if license_label:\n            collection.ulabels.add(license_label)\n        if project_label:\n            collection.projects.add(project_label)\n        return collection\n"
  },
  {
    "path": "lamindb/integrations/_vitessce.py",
    "content": "from __future__ import annotations\n\nimport json\nfrom datetime import datetime, timezone\nfrom typing import TYPE_CHECKING\n\nimport lamindb_setup as ln_setup\nfrom lamin_utils import logger\n\nfrom lamindb.models.artifact import Artifact\nfrom lamindb.models.collection import Collection\nfrom lamindb.models.run import Run\nfrom lamindb.models.transform import Transform\n\nif TYPE_CHECKING:\n    from vitessce import VitessceConfig\n\n\n# \"unit test\": https://github.com/laminlabs/lamindb/blob/main/docs/storage/vitessce.ipynb\n# integration test & context: https://github.com/laminlabs/lamin-spatial/blob/main/docs/vitessce.ipynb\ndef save_vitessce_config(\n    vitessce_config: VitessceConfig,\n    key: str | None = None,\n    description: str | None = None,\n) -> Artifact:\n    \"\"\"Validates and saves a `VitessceConfig` object.\n\n    If the `VitessceConfig` object references multiple artifacts, automatically\n    creates a `Collection` and displays the \"Vitessce button\" next to it.\n\n    The `VitessceConfig` artifact has `.suffix = \".vitessce.json\"` and `.kind = \"__lamindb_config__\"`,\n    which is by default hidden on the hub UI.\n\n    Guide: :doc:`docs:vitessce`.\n\n    Args:\n        vitessce_config: A `VitessceConfig` object.\n        key: A `key` for the `VitessceConfig` artifact.\n        description: A `description` for the `VitessceConfig` aritifact. 
Is additionally\n            used as `key` for a `Collection` in case the `VitessceConfig` object\n            references multiple artifacts.\n    \"\"\"\n    # can only import here because vitessce is not a dependency\n    from vitessce import VitessceConfig\n\n    assert isinstance(vitessce_config, VitessceConfig)  # noqa: S101\n    vc_dict = vitessce_config.to_dict()\n    try:\n        url_to_artifact_dict = vitessce_config.get_artifacts()\n    except AttributeError as e:\n        raise SystemExit(\n            \"save_vitessce_config() requires vitessce>=3.4.0: pip install vitessce>=3.4.0\"\n        ) from e\n    dataset_artifacts = list(url_to_artifact_dict.values())\n    message = \"\\n\".join([artifact.__repr__() for artifact in dataset_artifacts])\n    logger.important(f\"VitessceConfig references these artifacts:\\n{message}\")\n    assert len(dataset_artifacts) > 0  # noqa: S101\n\n    # the below will be replaced with a `ln.step()` decorator soon\n    transform = Transform(  # type: ignore\n        uid=\"kup03MJBsIVa0002\",\n        key=\"save_vitessce_config\",\n        type=\"function\",\n        version=\"3\",\n    ).save()\n    run = Run(transform=transform).save()\n    run.input_artifacts.set(dataset_artifacts)\n    collection = None\n    if len(dataset_artifacts) > 1:\n        # if we have more datasets, we should create a collection\n        # and attach an action to the collection\n        # consicious use of description for key, see here\n        # https://github.com/laminlabs/lamindb/pull/2997\n        collection = Collection(dataset_artifacts, key=description).save()\n\n    # create a JSON export\n    config_file_local_path = ln_setup.settings.cache_dir / \"config.vitessce.json\"\n    with open(config_file_local_path, \"w\") as file:\n        json.dump(vc_dict, file)\n    vitessce_config_artifact = Artifact(\n        config_file_local_path,\n        key=key,\n        description=description,\n        run=run,\n        
kind=\"__lamindb_config__\",\n    ).save()\n    slug = ln_setup.settings.instance.slug\n    logger.important(\n        f\"VitessceConfig: https://lamin.ai/{slug}/artifact/{vitessce_config_artifact.uid}\"\n    )\n    if collection is None:\n        # we have one and only one dataset artifact, hence the following line is OK\n        dataset_artifacts[0]._actions.add(vitessce_config_artifact)\n        logger.important(\n            f\"Dataset: https://lamin.ai/{slug}/artifact/{dataset_artifacts[0].uid}\"\n        )\n    else:\n        collection._actions.add(vitessce_config_artifact)\n        logger.important(\n            f\"Collection: https://lamin.ai/{slug}/collection/{collection.uid}\"\n        )\n    run.finished_at = datetime.now(timezone.utc)\n    run.save()\n    return vitessce_config_artifact\n"
  },
  {
    "path": "lamindb/integrations/lightning.py",
    "content": "\"\"\"PyTorch Lightning integration for LaminDB.\n\nThe public API has two layers:\n\n- :class:`Checkpoint` is the concrete LaminDB implementation that persists checkpoint, config, and `hparams.yaml` files as :class:`~lamindb.Artifact` objects and annotates them with :class:`~lamindb.Feature` objects.\n- :class:`ArtifactPublishingModelCheckpoint` is the generic extension layer adding checkpoint artifact lifecycle hooks without implementing Lamin persistence details yet.\n\nExternal integrations can either subclass :class:`Checkpoint` directly or attach\nan :class:`ArtifactObserver` to react to saved and removed artifacts.\n\nHere is a guide: :doc:`lightning`.\n\nMain API\n--------\n\n.. autoclass:: Checkpoint\n.. autofunction:: save_lightning_features\n\nAuxiliary classes\n-----------------\n\n.. autoclass:: ArtifactPublishingModelCheckpoint\n.. autoclass:: SaveConfigCallback\n.. autoclass:: ArtifactSavedEvent\n.. autoclass:: ArtifactRemovedEvent\n\"\"\"\n\nfrom __future__ import annotations\n\nimport warnings\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, Final, Literal, Protocol\n\nimport lightning.pytorch as pl\nfrom lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint\nfrom lightning.pytorch.cli import SaveConfigCallback as _SaveConfigCallback\n\nimport lamindb as ln\nfrom lamindb.models.artifact import track_run_input\n\nif TYPE_CHECKING:\n    from datetime import timedelta\n\n    from lightning.fabric.utilities.types import _PATH\n\n\n_RUN_AUTO_FEATURES: Final = frozenset(\n    {\n        \"logger_name\",\n        \"logger_version\",\n        \"max_epochs\",\n        \"max_steps\",\n        \"precision\",\n        \"accumulate_grad_batches\",\n        \"gradient_clip_val\",\n        \"monitor\",\n        \"mode\",\n    }\n)\n_ARTIFACT_AUTO_FEATURES: Final = frozenset(\n    {\n        \"is_best_model\",\n        \"is_last_model\",\n        \"score\",\n        \"model_rank\",\n 
       \"save_weights_only\",\n        \"monitor\",\n        \"mode\",\n    }\n)\n_SUPPORTED_AUTO_FEATURES: Final = _RUN_AUTO_FEATURES | _ARTIFACT_AUTO_FEATURES\nArtifactKind = Literal[\"checkpoint\", \"config\", \"hparams\"]\n\n\n@dataclass(frozen=True)\nclass ArtifactEvent:\n    \"\"\"Common metadata emitted when a checkpoint-related artifact changes.\n\n    The event records the logical artifact key, the local path Lightning wrote,\n    and the trainer that triggered the lifecycle event.\n    \"\"\"\n\n    kind: ArtifactKind\n    key: str\n    local_path: Path\n    trainer: pl.Trainer\n\n\n@dataclass(frozen=True)\nclass ArtifactSavedEvent(ArtifactEvent):\n    \"\"\"Metadata emitted after a checkpoint-related artifact has been persisted.\n\n    `artifact` is intentionally typed generically so downstream integrations can\n    expose their own persisted object while still using the common lifecycle API.\n    `storage_uri` is the stable hand-off value for registries such as ClearML.\n    \"\"\"\n\n    artifact: Any\n    storage_uri: str\n\n\n@dataclass(frozen=True)\nclass ArtifactRemovedEvent(ArtifactEvent):\n    \"\"\"Metadata emitted after a local checkpoint file has been removed.\n\n    Removal currently applies to checkpoint files. 
Config and hparams artifacts are\n    save-only in the current Lightning integration.\n    \"\"\"\n\n    artifact: Any | None = None\n    storage_uri: str | None = None\n\n\nclass ArtifactObserver(Protocol):\n    \"\"\"Observer notified about checkpoint artifact lifecycle events.\n\n    This is the preferred composition hook for downstream integrations that need\n    to register checkpoints elsewhere after Lamin persistence completes.\n    \"\"\"\n\n    def on_artifact_saved(self, event: ArtifactSavedEvent) -> None: ...\n\n    def on_artifact_removed(self, event: ArtifactRemovedEvent) -> None: ...\n\n\nclass ArtifactPublisher(Protocol):\n    \"\"\"Persistence backend for checkpoint-related artifacts.\n\n    :class:`ArtifactPublishingModelCheckpoint` manages the artifact lifecycle,\n    while publishers encapsulate backend-specific save behavior and storage URI\n    resolution.\n    \"\"\"\n\n    def create_artifact(\n        self,\n        local_path: Path | str,\n        *,\n        key: str,\n        description: str,\n        kind: str | None = None,\n        add_as_input_to_run: bool = False,\n        skip_hash_lookup: bool = False,\n    ) -> Any: ...\n\n    def storage_uri(self, artifact: Any) -> str: ...\n\n\nclass LaminArtifactPublisher:\n    \"\"\"Persist checkpoint-related artifacts into LaminDB.\n\n    This service is intentionally separate from :class:`Checkpoint` so that the\n    checkpoint callback can focus on Lightning behavior and feature handling while\n    persistence details remain replaceable.\n    \"\"\"\n\n    def create_artifact(\n        self,\n        local_path: Path | str,\n        *,\n        key: str,\n        description: str,\n        kind: str | None = None,\n        add_as_input_to_run: bool = False,\n        skip_hash_lookup: bool = False,\n    ) -> ln.Artifact:\n        artifact_kwargs: dict[str, Any] = {\"key\": key, \"description\": description}\n        if kind is not None:\n            artifact_kwargs[\"kind\"] = kind\n        
if add_as_input_to_run:\n            artifact_kwargs[\"run\"] = False\n        if skip_hash_lookup:\n            artifact_kwargs[\"skip_hash_lookup\"] = True\n        artifact = ln.Artifact(local_path, **artifact_kwargs)\n        artifact.save()\n        if add_as_input_to_run:\n            track_run_input(artifact, is_run_input=True)\n        return artifact\n\n    def storage_uri(self, artifact: ln.Artifact) -> str:\n        return str(artifact.path)\n\n\ndef save_lightning_features() -> None:\n    \"\"\"Save features to auto-track lightning parameters & metrics.\n\n    Creates the following features under the `lamindb.lightning` feature type if they do not already exist:\n\n    Artifact-level features:\n\n    - `is_best_model` (bool): Whether this checkpoint is the best model.\n    - `is_last_model` (bool): Whether this checkpoint is the most recently saved model.\n    - `score` (float): The monitored metric score.\n    - `model_rank` (int): Rank among all checkpoints (0 = best).\n    - `save_weights_only` (bool): Whether this checkpoint only stores model weights.\n    - `monitor` (str): Metric name this checkpoint uses for comparison.\n    - `mode` (str): Optimization mode (`min` or `max`) used for checkpoint ranking.\n\n    Run-level features:\n\n    - `logger_name` (str): Name from the first Lightning logger.\n    - `logger_version` (str): Version from the first Lightning logger.\n    - `max_epochs` (int): Maximum number of epochs.\n    - `max_steps` (int): Maximum number of training steps.\n    - `precision` (str): Training precision (e.g., \"32\", \"16-mixed\", \"bf16\").\n    - `accumulate_grad_batches` (int): Number of batches to accumulate gradients over.\n    - `gradient_clip_val` (float): Gradient clipping value.\n    - `monitor` (str): Metric name being monitored.\n    - `mode` (str): Optimization mode (\"min\" or \"max\").\n\n    Args:\n        None.\n\n    Example:\n\n        Save the features to the database::\n\n            from 
lamindb.integrations import lightning as ll\n\n            ll.save_lightning_features()\n    \"\"\"\n    # normal matching fails because of non-matching dtype (__lamindb_lightning__ vs None)\n    if (\n        lightning_feature_type := ln.Feature.filter(\n            name=\"lamindb.lightning\"\n        ).one_or_none()\n    ) is None:\n        lightning_feature_type = ln.Feature(  # type: ignore[call-overload]\n            name=\"lamindb.lightning\",\n            description=\"Auto-generated features tracking lightning parameters & metrics\",\n            is_type=True,\n        )\n        lightning_feature_type._dtype_str = \"__lamindb_lightning__\"\n        lightning_feature_type.save()\n\n    ln.Feature(name=\"is_best_model\", dtype=bool, type=lightning_feature_type).save()\n    ln.Feature(name=\"is_last_model\", dtype=bool, type=lightning_feature_type).save()\n    ln.Feature(name=\"score\", dtype=float, type=lightning_feature_type).save()\n    ln.Feature(name=\"model_rank\", dtype=int, type=lightning_feature_type).save()\n    ln.Feature(name=\"logger_name\", dtype=str, type=lightning_feature_type).save()\n    ln.Feature(name=\"logger_version\", dtype=str, type=lightning_feature_type).save()\n    ln.Feature(name=\"max_epochs\", dtype=int, type=lightning_feature_type).save()\n    ln.Feature(name=\"max_steps\", dtype=int, type=lightning_feature_type).save()\n    ln.Feature(name=\"precision\", dtype=str, type=lightning_feature_type).save()\n    ln.Feature(\n        name=\"accumulate_grad_batches\", dtype=int, type=lightning_feature_type\n    ).save()\n    ln.Feature(\n        name=\"gradient_clip_val\", dtype=float, type=lightning_feature_type\n    ).save()\n    ln.Feature(name=\"monitor\", dtype=str, type=lightning_feature_type).save()\n    ln.Feature(name=\"save_weights_only\", dtype=bool, type=lightning_feature_type).save()\n    ln.Feature(name=\"mode\", dtype=str, type=lightning_feature_type).save()\n\n\nclass FeatureAnnotator:\n    \"\"\"Manages Lightning 
feature discovery, collection, and annotation.\n\n    This helper encapsulates all feature-related state and logic used by\n    :class:`Checkpoint`.  It handles:\n\n    - Validation of user-specified features at setup time\n    - Discovery of auto-features created by :func:`save_lightning_features`\n    - Collection of run-level and checkpoint-level feature values\n    - Best-model flag management and model rank updates\n\n    The annotator is decoupled from `ModelCheckpoint` state — checkpoint-specific\n    values (`best_model_path`, `current_score`, `mode`, etc.) are passed as\n    explicit arguments to collection methods.\n    \"\"\"\n\n    def __init__(\n        self,\n        features: dict[Literal[\"run\", \"artifact\"], dict[str, Any]] | None = None,\n    ) -> None:\n        user_features = features or {}\n        if invalid_keys := set(user_features) - {\"run\", \"artifact\"}:  # type: ignore\n            raise ValueError(\n                f\"Invalid feature keys: {invalid_keys}. 
Use 'run' and/or 'artifact'.\"\n            )\n        self._run_features: dict[str, Any] = user_features.get(\"run\", {})\n        self._artifact_features: dict[str, Any] = user_features.get(\"artifact\", {})\n        self._auto_features: dict[str, ln.Feature] = {}\n        self._hparam_features_available: set[str] = set()\n        self._run_features_saved = False\n\n    def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:\n        \"\"\"Validate user features and discover auto-features.\n\n        Must be called during `Checkpoint.setup()` while `trainer.is_global_zero`\n        is `True`.\n        \"\"\"\n        self._validate_user_features()\n        self._attach_user_run_features()\n        self._discover_auto_features()\n        self._discover_hparam_features(trainer, pl_module)\n\n    def _attach_user_run_features(self) -> None:\n        \"\"\"Attach user-specified run features to the active LaminDB run.\"\"\"\n        if ln.context.run and self._run_features:\n            ln.context.run.features.add_values(self._run_features)\n\n    def _validate_user_features(self) -> None:\n        \"\"\"Ensure all user-specified feature names exist in the database.\"\"\"\n        all_feature_names = set(self._run_features) | set(self._artifact_features)\n        if not all_feature_names:\n            return\n        existing = set(\n            ln.Feature.filter(name__in=all_feature_names).values_list(\"name\", flat=True)\n        )\n        missing = [n for n in all_feature_names if n not in existing]\n        if missing:\n            s = \"s\" if len(missing) > 1 else \"\"\n            raise ValueError(\n                f\"Feature{s} {', '.join(missing)} missing. 
\"\n                f\"Create {'them' if len(missing) > 1 else 'it'} first.\"\n            )\n\n    def _discover_auto_features(self) -> None:\n        \"\"\"Load auto-features scoped to the `lamindb.lightning` feature type.\"\"\"\n        lightning_feature_type = ln.Feature.filter(\n            name=\"lamindb.lightning\", is_type=True\n        ).one_or_none()\n        self._auto_features.clear()\n        if lightning_feature_type is not None:\n            self._auto_features = {\n                f.name: f\n                for f in ln.Feature.filter(\n                    name__in=_SUPPORTED_AUTO_FEATURES,\n                    type=lightning_feature_type,\n                )\n            }\n\n    def _discover_hparam_features(\n        self, trainer: pl.Trainer, pl_module: pl.LightningModule\n    ) -> None:\n        \"\"\"Find which hyperparameter names have matching Features in the DB.\"\"\"\n        hparam_names = self._collect_hparam_names(pl_module, trainer.datamodule)\n        self._hparam_features_available = (\n            set(ln.Feature.filter(name__in=hparam_names).values_list(\"name\", flat=True))\n            if hparam_names\n            else set()\n        )\n\n    @staticmethod\n    def _collect_hparam_names(*sources: Any) -> set[str]:\n        \"\"\"Gather hyperparameter names from one or more sources.\"\"\"\n        names: set[str] = set()\n        for source in sources:\n            if source is not None and hasattr(source, \"hparams\") and source.hparams:\n                names.update(source.hparams.keys())\n        return names\n\n    def get(self, name: str) -> ln.Feature | None:\n        \"\"\"Return the typed auto-feature for *name*, or `None`.\"\"\"\n        return self._auto_features.get(name)\n\n    def _set(self, target: dict[str | ln.Feature, Any], name: str, value: Any) -> None:\n        \"\"\"Add *value* to *target* if the auto-feature *name* is tracked and *value* is not `None`.\"\"\"\n        if (feature := self.get(name)) and value is 
not None:\n            target[feature] = value\n\n    def save_run_features(\n        self,\n        trainer: pl.Trainer,\n        monitor: str | None,\n        mode: str,\n    ) -> None:\n        \"\"\"Collect and attach run-level features once per run.\n\n        Idempotent — subsequent calls are no-ops.\n        \"\"\"\n        if not ln.context.run or self._run_features_saved:\n            return\n\n        run_features = self._collect_run_features(trainer, monitor, mode)\n        if run_features:\n            ln.context.run.features.add_values(run_features)\n        self._run_features_saved = True\n\n    def _collect_run_features(\n        self,\n        trainer: pl.Trainer,\n        monitor: str | None,\n        mode: str,\n    ) -> dict[str | ln.Feature, Any]:\n        \"\"\"Build the dict of run-level feature values (pure, no DB writes).\"\"\"\n        run_features: dict[str | ln.Feature, Any] = {}\n\n        if trainer.loggers:\n            self._set(run_features, \"logger_name\", trainer.loggers[0].name)\n            version = trainer.loggers[0].version\n            self._set(\n                run_features,\n                \"logger_version\",\n                version if isinstance(version, str) else f\"version_{version}\",\n            )\n\n        # Trainer config values\n        self._add_trainer_config_features(run_features, trainer, monitor, mode)\n\n        # Hyperparameters\n        self._add_hparam_features(\n            run_features, trainer.lightning_module, trainer.datamodule\n        )\n\n        return run_features\n\n    def _add_trainer_config_features(\n        self,\n        target: dict[str | ln.Feature, Any],\n        trainer: pl.Trainer,\n        monitor: str | None,\n        mode: str,\n    ) -> None:\n        \"\"\"Append trainer configuration values to *target*.\"\"\"\n        self._set(target, \"max_epochs\", trainer.max_epochs)\n        self._set(target, \"max_steps\", trainer.max_steps)\n        self._set(target, \"precision\", 
str(trainer.precision))\n        self._set(target, \"accumulate_grad_batches\", trainer.accumulate_grad_batches)\n        self._set(target, \"gradient_clip_val\", trainer.gradient_clip_val)\n        self._set(target, \"monitor\", monitor)\n        self._set(target, \"mode\", mode)\n\n    def _add_hparam_features(\n        self,\n        target: dict[str | ln.Feature, Any],\n        *sources: Any,\n    ) -> None:\n        \"\"\"Append hyperparameter values from one or more sources to *target*.\"\"\"\n        for source in sources:\n            if source is None:\n                continue\n            if hasattr(source, \"hparams\") and source.hparams:\n                for name, value in source.hparams.items():\n                    if name in self._hparam_features_available:\n                        target[name] = value\n\n    def collect_checkpoint_features(\n        self,\n        trainer: pl.Trainer,\n        is_best: bool,\n        current_score: Any | None,\n        save_weights_only: bool,\n        monitor: str | None,\n        mode: str,\n    ) -> dict[str | ln.Feature, Any]:\n        \"\"\"Collect feature values for a checkpoint artifact.\n\n        All `ModelCheckpoint` state is passed as explicit arguments so the\n        annotator stays decoupled from the callback class hierarchy.\n\n        Does **not** mutate existing artifacts — call\n        :meth:`clear_best_model_flags` or :meth:`clear_last_model_flags`\n        separately when needed.\n        \"\"\"\n        feature_values: dict[str | ln.Feature, Any] = {}\n\n        self._set(feature_values, \"is_best_model\", is_best)\n        self._set(feature_values, \"is_last_model\", True)\n\n        if current_score is not None:\n            score = current_score\n            if hasattr(score, \"item\"):\n                score = score.item()\n            self._set(feature_values, \"score\", float(score))\n        self._set(feature_values, \"save_weights_only\", save_weights_only)\n        
self._set(feature_values, \"monitor\", monitor)\n        self._set(feature_values, \"mode\", mode)\n\n        # User-specified artifact features\n        for name, value in self._artifact_features.items():\n            if value is not None:\n                feature_values[name] = value\n            elif hasattr(trainer, name):\n                feature_values[name] = getattr(trainer, name)\n            elif name in trainer.callback_metrics:\n                metric = trainer.callback_metrics[name]\n                feature_values[name] = (\n                    metric.item() if hasattr(metric, \"item\") else float(metric)\n                )\n        return feature_values\n\n    def clear_best_model_flags(self, checkpoint_key_prefix: str) -> None:\n        \"\"\"Set `is_best_model=False` on previous best checkpoints.\"\"\"\n        self._clear_flagged_model_feature(\"is_best_model\", checkpoint_key_prefix)\n\n    def clear_last_model_flags(self, checkpoint_key_prefix: str) -> None:\n        \"\"\"Set `is_last_model=False` on previous latest checkpoints.\"\"\"\n        self._clear_flagged_model_feature(\"is_last_model\", checkpoint_key_prefix)\n\n    def _clear_flagged_model_feature(\n        self,\n        feature_name: Literal[\"is_best_model\", \"is_last_model\"],\n        checkpoint_key_prefix: str,\n    ) -> None:\n        \"\"\"Set a boolean model flag to `False` on previously flagged checkpoints.\"\"\"\n        feature = self.get(feature_name)\n        if feature is None:\n            return\n        feature_rows = self._get_artifact_feature_rows(\n            {feature_name}, checkpoint_key_prefix\n        )\n        artifact_ids = [\n            artifact_id\n            for artifact_id, values in feature_rows.items()\n            if values.get(feature_name) is True\n        ]\n        if not artifact_ids:\n            return\n        artifacts_by_id = {a.id: a for a in ln.Artifact.filter(id__in=artifact_ids)}\n        for artifact_id in artifact_ids:\n            
if artifact_id not in artifacts_by_id:\n                continue\n            artifact = artifacts_by_id[artifact_id]\n            artifact.features.remove_values(feature, value=True)\n            artifact.features.add_values({feature: False})\n\n    def update_model_ranks(self, checkpoint_key_prefix: str, mode: str) -> None:\n        \"\"\"Re-rank all checkpoint artifacts under *checkpoint_key_prefix*.\"\"\"\n        model_rank_feature = self.get(\"model_rank\")\n        if model_rank_feature is None:\n            return\n        feature_rows = self._get_artifact_feature_rows(\n            {\"score\", \"model_rank\"}, checkpoint_key_prefix\n        )\n        scored = []\n        for artifact_id, values in feature_rows.items():\n            if \"score\" in values:\n                scored.append((values[\"score\"], values.get(\"model_rank\"), artifact_id))\n        scored.sort(key=lambda x: x[0], reverse=(mode == \"max\"))\n\n        artifact_ids = [artifact_id for _, _, artifact_id in scored]\n        artifacts_by_id = {a.id: a for a in ln.Artifact.filter(id__in=artifact_ids)}\n        for rank, (_, old_rank, artifact_id) in enumerate(scored):\n            if artifact_id not in artifacts_by_id:\n                continue\n            af = artifacts_by_id[artifact_id]\n            if old_rank is not None:\n                af.features.remove_values(model_rank_feature, value=old_rank)\n            af.features.add_values({model_rank_feature: rank})\n\n    def _get_artifact_feature_rows(\n        self,\n        feature_names: set[str],\n        checkpoint_key_prefix: str,\n    ) -> dict[int, dict[str, Any]]:\n        \"\"\"Query feature values for checkpoint artifacts under *checkpoint_key_prefix*.\n\n        Returns a dict keyed by artifact ID, where each value is a dict mapping\n        feature name to its stored value.  
Example::\n\n            {\n                42: {\"score\": 0.95, \"is_best_model\": True},\n                71: {\"score\": 0.87, \"is_best_model\": False, \"model_rank\": 1},\n            }\n        \"\"\"\n        feature_ids = [\n            feature.id for name in feature_names if (feature := self.get(name))\n        ]\n        key_startswith = checkpoint_key_prefix + \"/\"\n        if feature_ids:\n            rows = ln.models.ArtifactJsonValue.filter(\n                artifact__key__startswith=key_startswith,\n                jsonvalue__feature_id__in=feature_ids,\n            ).values_list(\"artifact_id\", \"jsonvalue__feature__name\", \"jsonvalue__value\")\n        else:\n            rows = ln.models.ArtifactJsonValue.filter(\n                artifact__key__startswith=key_startswith,\n                jsonvalue__feature__name__in=feature_names,\n            ).values_list(\"artifact_id\", \"jsonvalue__feature__name\", \"jsonvalue__value\")\n        result: dict[int, dict[str, Any]] = {}\n        for artifact_id, feature_name, value in rows:\n            if artifact_id not in result:\n                result[artifact_id] = {}\n            result[artifact_id][feature_name] = value\n        return result\n\n\nclass ArtifactPublishingModelCheckpoint(ModelCheckpoint):\n    \"\"\"ModelCheckpoint with observable artifact lifecycle hooks.\n\n    This layer captures artifact kinds, observer registration, saved/removed\n        events, latest artifact tracking, and key compatibility hooks. 
Concrete\n        subclasses remain responsible for how artifacts are persisted.\n\n        Subclasses are expected to implement:\n\n        - :meth:`resolve_artifact_key` to map local files to logical artifact keys\n        - :meth:`resolve_artifact_storage_uri` to expose a stable backend URI\n        - :meth:`save_checkpoint_artifact`, :meth:`save_config_artifact`, and\n            :meth:`save_hparams_artifact` to persist files\n\n        :class:`SaveConfigCallback` only depends on this base class, which means a\n        custom checkpoint callback can participate in config saving without inheriting\n        from Lamin's concrete :class:`Checkpoint`.\n    \"\"\"\n\n    def __init__(\n        self,\n        *args: Any,\n        artifact_observers: list[ArtifactObserver] | None = None,\n        **kwargs: Any,\n    ) -> None:\n        super().__init__(*args, **kwargs)\n        self._artifact_observers: list[ArtifactObserver] = list(\n            artifact_observers or []\n        )\n        self._latest_artifacts: dict[ArtifactKind, Any | None] = {\n            \"checkpoint\": None,\n            \"config\": None,\n            \"hparams\": None,\n        }\n        self._last_artifact_event: ArtifactSavedEvent | ArtifactRemovedEvent | None = (\n            None\n        )\n\n    @property\n    def last_checkpoint_artifact(self) -> Any | None:\n        \"\"\"The most recently saved checkpoint artifact handle.\"\"\"\n        return self._latest_artifacts[\"checkpoint\"]\n\n    @property\n    def last_config_artifact(self) -> Any | None:\n        \"\"\"The most recently saved config artifact handle.\"\"\"\n        return self._latest_artifacts[\"config\"]\n\n    @property\n    def last_hparams_artifact(self) -> Any | None:\n        \"\"\"The most recently saved hparams artifact handle.\"\"\"\n        return self._latest_artifacts[\"hparams\"]\n\n    @property\n    def last_artifact_event(self) -> ArtifactSavedEvent | ArtifactRemovedEvent | None:\n        \"\"\"The last 
artifact lifecycle event emitted by this callback.\"\"\"\n        return self._last_artifact_event\n\n    def get_last_artifact(self, kind: ArtifactKind) -> Any | None:\n        \"\"\"Return the most recently saved artifact for a given artifact kind.\"\"\"\n        return self._latest_artifacts[kind]\n\n    def add_artifact_observer(self, observer: ArtifactObserver) -> None:\n        \"\"\"Register an observer notified about artifact lifecycle events.\"\"\"\n        self._artifact_observers.append(observer)\n\n    def remove_artifact_observer(self, observer: ArtifactObserver) -> None:\n        \"\"\"Unregister a previously added artifact observer.\"\"\"\n        self._artifact_observers.remove(observer)\n\n    def resolve_artifact_storage_uri(self, artifact: Any) -> str:\n        \"\"\"Resolve the physical location for a persisted artifact.\"\"\"\n        raise NotImplementedError\n\n    def resolve_artifact_key(\n        self,\n        trainer: pl.Trainer,\n        filepath: Path | str,\n        kind: ArtifactKind,\n    ) -> str:\n        \"\"\"Return the logical artifact key for a checkpoint-related file.\"\"\"\n        raise NotImplementedError\n\n    def _notify_artifact_saved(\n        self,\n        trainer: pl.Trainer,\n        *,\n        kind: ArtifactKind,\n        key: str,\n        artifact: Any,\n        local_path: Path | str,\n    ) -> ArtifactSavedEvent:\n        event = ArtifactSavedEvent(\n            kind=kind,\n            key=key,\n            local_path=Path(local_path),\n            trainer=trainer,\n            artifact=artifact,\n            storage_uri=self.resolve_artifact_storage_uri(artifact),\n        )\n        self._latest_artifacts[kind] = artifact\n        self._last_artifact_event = event\n        self.on_artifact_saved(event)\n        self._notify_artifact_observers(\"on_artifact_saved\", event)\n        return event\n\n    def _notify_artifact_removed(\n        self,\n        trainer: pl.Trainer,\n        *,\n        kind: 
ArtifactKind,\n        key: str,\n        local_path: Path | str,\n        artifact: Any | None,\n    ) -> ArtifactRemovedEvent:\n        storage_uri = None\n        if artifact is not None:\n            storage_uri = self.resolve_artifact_storage_uri(artifact)\n        event = ArtifactRemovedEvent(\n            kind=kind,\n            key=key,\n            local_path=Path(local_path),\n            trainer=trainer,\n            artifact=artifact,\n            storage_uri=storage_uri,\n        )\n        self._last_artifact_event = event\n        self.on_artifact_removed(event)\n        self._notify_artifact_observers(\"on_artifact_removed\", event)\n        return event\n\n    def _notify_artifact_observers(\n        self,\n        method_name: str,\n        event: ArtifactSavedEvent | ArtifactRemovedEvent,\n    ) -> None:\n        for observer in tuple(self._artifact_observers):\n            method = getattr(observer, method_name, None)\n            if callable(method):\n                method(event)\n\n    def on_artifact_saved(self, event: ArtifactSavedEvent) -> None:\n        \"\"\"Hook for subclasses after an artifact has been saved.\"\"\"\n        del event\n\n    def on_artifact_removed(self, event: ArtifactRemovedEvent) -> None:\n        \"\"\"Hook for subclasses after a checkpoint file has been removed.\"\"\"\n        del event\n\n    def save_checkpoint_artifact(\n        self,\n        trainer: pl.Trainer,\n        filepath: Path | str,\n        *,\n        feature_values: dict[str, Any] | None = None,\n    ) -> Any:\n        \"\"\"Persist a checkpoint artifact and emit the corresponding event.\"\"\"\n        del trainer, filepath, feature_values\n        raise NotImplementedError\n\n    def save_config_artifact(self, trainer: pl.Trainer, config_path: Path | str) -> Any:\n        \"\"\"Persist a config artifact and emit the corresponding event.\"\"\"\n        del trainer, config_path\n        raise NotImplementedError\n\n    def save_hparams_artifact(\n  
      self, trainer: pl.Trainer, hparams_path: Path | str\n    ) -> Any | None:\n        \"\"\"Persist an hparams artifact and emit the corresponding event.\"\"\"\n        del trainer, hparams_path\n        raise NotImplementedError\n\n\nclass Checkpoint(ArtifactPublishingModelCheckpoint):\n    \"\"\"A `ModelCheckpoint` that annotates `pytorch` `lightning` checkpoints.\n\n    Extends `lightning`'s `ModelCheckpoint` with artifact creation & feature annotation.\n    Each checkpoint is a separate artifact whose key is derived from either the\n    explicit `dirpath` or the trainer's logger configuration.\n\n    When `dirpath` is omitted (recommended), Lightning decides where to store\n    checkpoints locally (typically `lightning_logs/version_N/checkpoints/`)\n    and the artifact key is derived from the logger's `save_dir`, `name`,\n    and `version`.  When `dirpath` is provided, it is used directly as the\n    key prefix.\n\n    All artifacts are scoped under a single **base prefix**.  Checkpoints\n    (and `hparams.yaml`) live under `{base}/checkpoints/`; other artifacts\n    (e.g. `config.yaml`) live directly under `{base}/`.\n\n    Base prefix derivation (highest priority first):\n\n    1. `dirpath` provided → `{dirpath}` (logger is ignored for key purposes)\n    2. `dirpath` omitted, logger present → `{save_dir_basename}/{name}/{version}`\n    3. `dirpath` omitted, no logger → empty\n\n    When `run_uid_is_version` is `True` (the default) and a Lamin run context\n    is active, the run UID is incorporated into the base prefix:\n\n    - Case 1/3: the run UID is appended as an extra path segment\n      (e.g. 
`my/dir/{run_uid}`, or just `{run_uid}`).\n    - Case 2: the logger's auto-incremented `version` is *replaced* by the\n      run UID (`{save_dir_basename}/{name}/{run_uid}`).\n\n    Resulting key layout (with run UID active)::\n\n        {base}/checkpoints/epoch=0-step=100.ckpt\n        {base}/checkpoints/hparams.yaml\n        {base}/config.yaml\n\n    If available in the database through `save_lightning_features()`, the following `lamindb.lightning` features are automatically tracked:\n\n    - Artifact-level: `is_best_model`, `is_last_model`, `score`, `model_rank`, `save_weights_only`, `monitor`, `mode`\n    - Run-level: `logger_name`, `logger_version`, `max_epochs`, `max_steps`, `precision`, `accumulate_grad_batches`, `gradient_clip_val`, `monitor`, `mode`\n\n    Additionally, model hyperparameters (from `pl_module.hparams`) and datamodule hyperparameters\n    (from `trainer.datamodule.hparams`) are captured if corresponding features exist.\n\n    This is the concrete LaminDB implementation built on top of\n    :class:`ArtifactPublishingModelCheckpoint`. Use it when you want LaminDB to be\n    the persistence layer. For secondary systems such as ClearML, prefer attaching\n    an :class:`ArtifactObserver` or subclassing :class:`Checkpoint` and reacting in\n    :meth:`on_artifact_saved`.\n\n    Args:\n        dirpath: Directory for checkpoints.  When provided, also used as the\n            artifact key prefix.  
When omitted (recommended), Lightning picks\n            the local directory and the key prefix is derived from the logger.\n        features: Features to annotate runs and artifacts.\n            Use \"run\" key for run-level features (static metadata).\n            Use \"artifact\" key for artifact-level features (values can be static or None for auto-population from trainer metrics/attributes).\n        monitor: Quantity to monitor for saving best checkpoint.\n        verbose: Verbosity mode.\n        save_last: Save a copy of the last checkpoint.\n        save_top_k: Number of best checkpoints to keep.\n        save_weights_only: Save only model weights (not optimizer state).\n        mode: One of \"min\" or \"max\" for monitor comparison.\n        auto_insert_metric_name: Include metric name in checkpoint filename.\n        every_n_train_steps: Checkpoint every N training steps.\n        train_time_interval: Checkpoint at time intervals.\n        every_n_epochs: Checkpoint every N epochs.\n        save_on_train_epoch_end: Run checkpointing at end of training epoch.\n        enable_version_counter: Append version to filename to avoid collisions.\n        run_uid_is_version: When `True` (default) and a Lamin run context is\n            active, incorporate the run UID into the base prefix. For the\n            logger case the logger's auto-incremented version is replaced;\n            for the dirpath and no-logger cases the run UID is appended as\n            an extra path segment. Prevents cross-run key collisions.\n        artifact_observers: Optional observer objects notified when checkpoint,\n            config, or hparams artifacts are saved or when checkpoint files are\n            removed locally. 
Observers follow :class:`ArtifactObserver` and\n            receive :class:`ArtifactSavedEvent` and :class:`ArtifactRemovedEvent`.\n\n    Examples:\n\n        Let Lightning decide where to store checkpoints (recommended)::\n\n            import lightning as pl\n            from lightning.pytorch.loggers import CSVLogger\n            from lamindb.integrations import lightning as ll\n\n            ll.save_lightning_features()\n\n            callback = ll.Checkpoint(monitor=\"val_loss\", save_top_k=3)\n            logger = CSVLogger(save_dir=\"logs\")\n\n            trainer = pl.Trainer(callbacks=[callback], logger=logger)\n            trainer.fit(model, dataloader)\n\n            # Query checkpoints — key prefix is derived from the logger\n            # e.g. \"logs/lightning_logs/version_0/checkpoints/\"\n            ln.Artifact.filter(key__startswith=callback.checkpoint_key_prefix)\n\n        Explicit `dirpath` for full control over the artifact key prefix::\n\n            callback = ll.Checkpoint(\n                dirpath=\"deployments/my_model/\",\n                monitor=\"val_loss\",\n                save_top_k=3,\n            )\n\n            trainer = pl.Trainer(callbacks=[callback])\n            trainer.fit(model, dataloader)\n\n            # Query checkpoints\n            ln.Artifact.filter(key__startswith=callback.checkpoint_key_prefix)\n\n        Using the CLI::\n\n            # config.yaml\n            trainer:\n              callbacks:\n                - class_path: lamindb.integrations.lightning.Checkpoint\n                  init_args:\n                    monitor: val_loss\n                    save_top_k: 3\n\n            # Run with:\n            # python main.py fit --config config.yaml\n\n        For more, see the guide: :doc:`lightning`.\n    \"\"\"\n\n    def __init__(\n        self,\n        dirpath: _PATH | None = None,\n        *,\n        features: dict[Literal[\"run\", \"artifact\"], dict[str, Any]] | None = None,\n        monitor: str | None 
= None,\n        verbose: bool = False,\n        save_last: bool | None = None,\n        save_top_k: int = 1,\n        save_weights_only: bool = False,\n        mode: Literal[\"min\", \"max\"] = \"min\",\n        auto_insert_metric_name: bool = True,\n        every_n_train_steps: int | None = None,\n        train_time_interval: timedelta | None = None,\n        every_n_epochs: int | None = None,\n        save_on_train_epoch_end: bool | None = None,\n        enable_version_counter: bool = True,\n        run_uid_is_version: bool = True,\n        artifact_observers: list[ArtifactObserver] | None = None,\n    ) -> None:\n        self._original_dirpath = dirpath\n        super().__init__(\n            dirpath=dirpath,\n            monitor=monitor,\n            verbose=verbose,\n            save_last=save_last,\n            save_top_k=save_top_k,\n            save_weights_only=save_weights_only,\n            mode=mode,\n            auto_insert_metric_name=auto_insert_metric_name,\n            every_n_train_steps=every_n_train_steps,\n            train_time_interval=train_time_interval,\n            every_n_epochs=every_n_epochs,\n            save_on_train_epoch_end=save_on_train_epoch_end,\n            enable_version_counter=enable_version_counter,\n            artifact_observers=artifact_observers,\n        )\n        self._feature_annotator = FeatureAnnotator(features)\n        self._hparams_yaml_saved = False\n        self._run_uid_is_version = run_uid_is_version\n        self._trainer: pl.Trainer | None = None\n        self._artifact_publisher: ArtifactPublisher = LaminArtifactPublisher()\n\n    def setup(\n        self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str\n    ) -> None:\n        \"\"\"Validate user features and detect available auto-features.\"\"\"\n        super().setup(trainer, pl_module, stage)\n        self._trainer = trainer\n\n        if self.save_last:\n            warnings.warn(\n                \"save_last is not necessary with 
Lamin. Checkpoint metadata\"\n                \" (is_best_model, is_last_model, model_rank, score) makes the latest checkpoint\"\n                \" queryable without encoding this in the filename. Consider\"\n                \" disabling save_last to avoid redundant checkpoint copies.\",\n                UserWarning,\n                stacklevel=2,\n            )\n\n        if trainer.is_global_zero:\n            self._feature_annotator.setup(trainer, pl_module)\n\n    def _base_prefix(self, trainer: pl.Trainer) -> str:\n        \"\"\"Compute the base artifact key prefix.\n\n        The base prefix is the root namespace for all artifacts produced by\n        this callback.  Checkpoints live under `{base}/checkpoints/` and\n        other files (config, hparams) directly under `{base}/`.\n\n        Priority: explicit `dirpath` > logger > run UID > empty.\n        \"\"\"\n        run_uid = self._active_run_uid()\n        if self._original_dirpath is not None:\n            prefix = str(self._original_dirpath).rstrip(\"/\")\n            return f\"{prefix}/{run_uid}\" if run_uid else prefix\n        if len(trainer.loggers) > 0:\n            return self._logger_prefix(trainer, run_uid)\n        return run_uid or \"\"\n\n    def _active_run_uid(self) -> str | None:\n        \"\"\"Return the Lamin run UID when run-UID scoping is active.\"\"\"\n        if self._run_uid_is_version and ln.context.run is not None:\n            return ln.context.run.uid\n        return None\n\n    def _logger_prefix(self, trainer: pl.Trainer, run_uid: str | None) -> str:\n        \"\"\"Derive a key prefix from the trainer's first logger.\"\"\"\n        assert trainer.loggers, \"_logger_prefix requires at least one logger\"\n        logger = trainer.loggers[0]\n        save_dir = logger.save_dir or trainer.default_root_dir\n        name = str(logger.name).rstrip(\"/\")\n        if run_uid:\n            version = run_uid\n        else:\n            version = logger.version\n            version = 
version if isinstance(version, str) else f\"version_{version}\"\n        return f\"{Path(save_dir).name}/{name}/{version.rstrip('/')}\"\n\n    @property\n    def base_prefix(self) -> str:\n        \"\"\"The base artifact key prefix for all artifacts from this callback.\n\n        Checkpoints live under `{base_prefix}/checkpoints/` and configs\n        directly under `{base_prefix}/`.\n\n        Available after `setup()` has been called.\n        \"\"\"\n        assert self._trainer is not None, \"base_prefix is only available after setup()\"\n        return self._base_prefix(self._trainer)\n\n    @property\n    def checkpoint_key_prefix(self) -> str:\n        \"\"\"The artifact key prefix used for checkpoint artifacts.\n\n        Available after `setup()` has been called, for example once\n        `trainer.fit()` has started.\n        \"\"\"\n        base = self.base_prefix\n        return f\"{base}/checkpoints\" if base else \"checkpoints\"\n\n    def resolve_artifact_storage_uri(self, artifact: ln.Artifact) -> str:\n        \"\"\"Resolve the physical artifact location for downstream registries.\n\n        This is the stable abstraction external packages should use instead of\n        reconstructing storage locations from Lamin internals.\n        \"\"\"\n        return self._artifact_publisher.storage_uri(artifact)\n\n    def resolve_artifact_key(\n        self,\n        trainer: pl.Trainer,\n        filepath: Path | str,\n        kind: ArtifactKind,\n    ) -> str:\n        \"\"\"Return the Lamin artifact key for a checkpoint-related file.\"\"\"\n        base = self._base_prefix(trainer)\n        if kind in {\"checkpoint\", \"hparams\"}:\n            prefix = f\"{base}/checkpoints\" if base else \"checkpoints\"\n        else:\n            prefix = base\n        if prefix:\n            return f\"{prefix}/{Path(filepath).name}\"\n        return Path(filepath).name\n\n    def _create_lamin_artifact(\n        self,\n        local_path: Path | str,\n        *,\n       
 key: str,\n        description: str,\n        kind: str | None = None,\n        add_as_input_to_run: bool = False,\n        skip_hash_lookup: bool = False,\n    ) -> ln.Artifact:\n        return self._artifact_publisher.create_artifact(\n            local_path,\n            key=key,\n            description=description,\n            kind=kind,\n            add_as_input_to_run=add_as_input_to_run,\n            skip_hash_lookup=skip_hash_lookup,\n        )\n        self._feature_annotator.clear_last_model_flags(self.checkpoint_key_prefix)\n\n    def save_checkpoint_artifact(\n        self,\n        trainer: pl.Trainer,\n        filepath: Path | str,\n        *,\n        feature_values: dict[str | ln.Feature, Any] | None = None,\n    ) -> ln.Artifact:\n        \"\"\"Save a checkpoint artifact to Lamin and emit the corresponding event.\n\n        This is the main persistence hook used by :meth:`_save_checkpoint`. It is a\n        useful override point for subclasses that want to augment Lamin persistence\n        while keeping the generic lifecycle behavior from the base class.\n        \"\"\"\n        key = self.resolve_artifact_key(\n            trainer=trainer, filepath=filepath, kind=\"checkpoint\"\n        )\n        existing_artifact = ln.Artifact.filter(key=key).one_or_none()\n        if existing_artifact is not None:\n            existing_artifact.delete(permanent=True, storage=True)\n        artifact = self._create_lamin_artifact(\n            filepath,\n            key=key,\n            description=\"model checkpoint\",\n            kind=\"model\",\n            skip_hash_lookup=True,\n        )\n        if feature_values:\n            artifact.features.add_values(feature_values)\n        self._notify_artifact_saved(\n            trainer,\n            kind=\"checkpoint\",\n            key=key,\n            artifact=artifact,\n            local_path=filepath,\n        )\n        return artifact\n\n    def save_config_artifact(\n        self, trainer: 
pl.Trainer, config_path: Path | str\n    ) -> ln.Artifact:\n        \"\"\"Save a Lightning CLI config artifact and emit the corresponding event.\n\n        Config artifacts are routed through the same lifecycle surface as\n        checkpoints so observers and subclasses see a unified event stream.\n        \"\"\"\n        key = self.resolve_artifact_key(\n            trainer=trainer, filepath=config_path, kind=\"config\"\n        )\n        artifact = self._create_lamin_artifact(\n            config_path,\n            key=key,\n            description=\"Lightning CLI config\",\n            kind=\"config\",\n            add_as_input_to_run=True,\n            skip_hash_lookup=True,\n        )\n        self._notify_artifact_saved(\n            trainer,\n            kind=\"config\",\n            key=key,\n            artifact=artifact,\n            local_path=config_path,\n        )\n        return artifact\n\n    def save_hparams_artifact(\n        self, trainer: pl.Trainer, hparams_path: Path | str\n    ) -> ln.Artifact | None:\n        \"\"\"Save Lightning's auto-generated hparams file and emit the event.\n\n        Returns `None` if Lightning did not generate `hparams.yaml` for the\n        current run.\n        \"\"\"\n        if not Path(hparams_path).exists():\n            return None\n\n        key = self.resolve_artifact_key(\n            trainer=trainer, filepath=hparams_path, kind=\"hparams\"\n        )\n        artifact = self._create_lamin_artifact(\n            hparams_path,\n            key=key,\n            description=\"Lightning run hyperparameters\",\n            kind=\"config\",\n            skip_hash_lookup=True,\n        )\n        self._notify_artifact_saved(\n            trainer,\n            kind=\"hparams\",\n            key=key,\n            artifact=artifact,\n            local_path=hparams_path,\n        )\n        return artifact\n\n    def _save_hparams_yaml(self, trainer: pl.Trainer) -> None:\n        \"\"\"Persist Lightning's 
auto-generated hparams file once per run.\"\"\"\n        if self._hparams_yaml_saved:\n            return\n\n        log_dir = trainer.log_dir\n        if not log_dir:\n            return\n\n        hparams_path = Path(log_dir) / \"hparams.yaml\"\n        if not hparams_path.exists():\n            return\n\n        if self.save_hparams_artifact(trainer, hparams_path) is not None:\n            self._hparams_yaml_saved = True\n\n    def _save_checkpoint(self, trainer: pl.Trainer, filepath: str) -> None:\n        \"\"\"Save checkpoint to the instance.\"\"\"\n        super()._save_checkpoint(trainer, filepath)\n\n        if not trainer.is_global_zero:\n            return\n        self._save_hparams_yaml(trainer)\n\n        self._feature_annotator.save_run_features(\n            trainer, monitor=self.monitor, mode=self.mode\n        )\n        self._feature_annotator.clear_last_model_flags(self.checkpoint_key_prefix)\n        is_best = self.best_model_path == str(filepath)\n        feature_values = self._feature_annotator.collect_checkpoint_features(\n            trainer,\n            is_best=is_best,\n            current_score=self.current_score,\n            save_weights_only=self.save_weights_only,\n            monitor=self.monitor,\n            mode=self.mode,\n        )\n\n        if is_best:\n            self._feature_annotator.clear_best_model_flags(self.checkpoint_key_prefix)\n\n        self.save_checkpoint_artifact(trainer, filepath, feature_values=feature_values)\n\n        self._feature_annotator.update_model_ranks(\n            self.checkpoint_key_prefix, mode=self.mode\n        )\n\n    def _remove_checkpoint(self, trainer: pl.Trainer, filepath: str) -> None:\n        \"\"\"Remove the local checkpoint file and emit a removal event.\"\"\"\n        artifact: ln.Artifact | None = None\n        key = self.resolve_artifact_key(\n            trainer=trainer, filepath=filepath, kind=\"checkpoint\"\n        )\n        if trainer.is_global_zero:\n            
artifact = ln.Artifact.filter(key=key).one_or_none()\n        super()._remove_checkpoint(trainer, filepath)\n        if trainer.is_global_zero:\n            self._notify_artifact_removed(\n                trainer,\n                kind=\"checkpoint\",\n                key=key,\n                local_path=filepath,\n                artifact=artifact,\n            )\n            if artifact is not None:\n                artifact.delete(permanent=True, storage=True)\n\n\nclass SaveConfigCallback(_SaveConfigCallback):\n    \"\"\"SaveConfigCallback that also saves config to the instance.\n\n    Use with LightningCLI to save the resolved configuration file alongside checkpoints.\n\n    The local config file is saved under `{save_dir}/{name}/{version}/`\n    derived from the first logger, avoiding Lightning's `trainer.log_dir`\n    which hardcodes an `isinstance` check for `TensorBoardLogger` /\n    `CSVLogger` and silently changes the directory for other loggers.\n\n    This callback looks for any :class:`ArtifactPublishingModelCheckpoint`, not just\n    Lamin's concrete :class:`Checkpoint`. That keeps the config-save path aligned\n    with custom subclasses built on the generic artifact-publishing base.\n\n    Config artifacts are stored directly under the **base prefix** of the\n    active :class:`Checkpoint` callback.  
The base prefix follows the same\n    derivation rules as for checkpoints (dirpath > logger > empty), so\n    configs are always co-located with their checkpoints:\n\n    - `Checkpoint.dirpath` set → `{dirpath}/config.yaml`\n      (`{dirpath}/{run_uid}/config.yaml` with run-UID scoping)\n    - Logger present, no `dirpath` → `{save_dir_basename}/{name}/{version}/config.yaml`\n    - Neither → `config.yaml` (or `{run_uid}/config.yaml` with run-UID scoping)\n\n    Example::\n\n        from lightning.pytorch.cli import LightningCLI\n        from lamindb.integrations import lightning as ll\n\n        cli = LightningCLI(\n            MyModel,\n            MyDataModule,\n            save_config_callback=ll.SaveConfigCallback,\n        )\n    \"\"\"\n\n    def setup(\n        self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str\n    ) -> None:\n        \"\"\"Save resolved configuration file alongside checkpoints.\"\"\"\n        if self.already_saved:  # type: ignore\n            return\n\n        if self.save_to_log_dir:\n            config_path = self._config_path(trainer)\n\n            if not self.overwrite:\n                file_exists = config_path.exists() if trainer.is_global_zero else False\n                file_exists = trainer.strategy.broadcast(file_exists)\n                if file_exists:\n                    raise RuntimeError(f\"Config file already exists: {config_path}\")\n\n            if trainer.is_global_zero:\n                config_path.parent.mkdir(exist_ok=True, parents=True)\n                self.parser.save(\n                    self.config,\n                    config_path,\n                    skip_none=False,\n                    overwrite=self.overwrite,\n                    multifile=self.multifile,\n                )\n                self._save_config(trainer, config_path)\n\n            if trainer.is_global_zero:\n                self.save_config(trainer, pl_module, stage)\n                self.already_saved = True\n            
self.already_saved = trainer.strategy.broadcast(self.already_saved)\n\n    def _config_path(self, trainer: pl.Trainer) -> Path:\n        \"\"\"Derive the local config file path from the first logger.\n\n        We intentionally avoid `trainer.log_dir` because Lightning hardcodes\n        an `isinstance` check against `TensorBoardLogger` and `CSVLogger`\n        there.  For those two loggers it uses `logger.log_dir` (which appends\n        name/version), while for every other logger it falls back to\n        `logger.save_dir` (no name/version).  This means the config file\n        location silently changes depending on which logger happens to be first\n        — making it unpredictable for third-party loggers.\n\n        This method always uses `logger.save_dir` + `name` + `version`,\n        giving a consistent directory layout regardless of logger type.\n        \"\"\"\n        if len(trainer.loggers) > 0:\n            first = trainer.loggers[0]\n            save_dir = (\n                first.save_dir\n                if first.save_dir is not None\n                else trainer.default_root_dir\n            )\n            name = first.name\n            version = first.version\n            version = version if isinstance(version, str) else f\"version_{version}\"\n            return Path(save_dir) / str(name) / version / self.config_filename\n        return Path(trainer.default_root_dir) / self.config_filename\n\n    def _save_config(self, trainer: pl.Trainer, config_path: Path) -> None:\n        \"\"\"Persist the resolved config through the active artifact checkpoint.\n\n        If no artifact-publishing checkpoint callback is registered, this becomes a\n        no-op and only Lightning's local config file is written.\n        \"\"\"\n        checkpoint_cb = self._get_artifact_checkpoint_callback(trainer)\n        if checkpoint_cb is None:\n            return\n\n        checkpoint_cb.save_config_artifact(trainer, config_path)\n\n    def 
_get_artifact_checkpoint_callback(\n        self, trainer: pl.Trainer\n    ) -> ArtifactPublishingModelCheckpoint | None:\n        \"\"\"Find the artifact-publishing checkpoint callback if present.\"\"\"\n        for cb in trainer.callbacks:\n            if isinstance(cb, ArtifactPublishingModelCheckpoint):\n                return cb\n        return None\n\n\n# backwards compatibility\n# We keep the full class around because it's short and it's cumbersome to write\n# full backwards compatibility code because of the rather different interfaces and behavior\nclass Callback(pl.Callback):\n    \"\"\"Saves checkpoints to LaminDB after each training epoch.\n\n    .. deprecated::\n        Use :class:`Checkpoint` instead for new code.\n\n    Args:\n        path: A local path to the checkpoint.\n        key: The `key` for the checkpoint artifact.\n        features: Features to annotate the checkpoint.\n    \"\"\"\n\n    def __init__(\n        self,\n        path: str | Path,\n        key: str,\n        features: dict[str, Any] | None = None,\n    ):\n        warnings.warn(\n            \"ll.Callback is deprecated, use ll.Checkpoint instead\",\n            DeprecationWarning,\n            stacklevel=2,\n        )\n        self.path = Path(path)\n        self.key = key\n        self.features = features or {}\n\n    def on_train_start(\n        self, trainer: pl.Trainer, pl_module: pl.LightningModule\n    ) -> None:\n        \"\"\"Validates that features exist for all specified params.\"\"\"\n        missing = [\n            name\n            for name in self.features\n            if ln.Feature.filter(name=name).one_or_none() is None\n        ]\n        if missing:\n            s = \"s\" if len(missing) > 1 else \"\"\n            raise ValueError(\n                f\"Feature{s} {', '.join(missing)} missing. 
\"\n                f\"Create {'them' if len(missing) > 1 else 'it'} first.\"\n            )\n\n    def on_train_epoch_end(\n        self, trainer: pl.Trainer, pl_module: pl.LightningModule\n    ) -> None:\n        \"\"\"Saves model checkpoint at the end of each epoch.\"\"\"\n        trainer.save_checkpoint(self.path)\n        artifact = ln.Artifact(self.path, key=self.key, kind=\"model\").save()\n\n        feature_values = dict(self.features)\n        for name in self.features:\n            if hasattr(trainer, name):\n                feature_values[name] = getattr(trainer, name)\n            elif name in trainer.callback_metrics:\n                metric = trainer.callback_metrics[name]\n                feature_values[name] = (\n                    metric.item() if hasattr(metric, \"item\") else float(metric)\n                )\n\n        if feature_values:\n            artifact.features.add_values(feature_values)\n\n\n__all__ = [\n    \"ArtifactObserver\",\n    \"ArtifactEvent\",\n    \"ArtifactPublisher\",\n    \"ArtifactPublishingModelCheckpoint\",\n    \"ArtifactRemovedEvent\",\n    \"ArtifactSavedEvent\",\n    \"Checkpoint\",\n    \"LaminArtifactPublisher\",\n    \"SaveConfigCallback\",\n    \"save_lightning_features\",\n]\n"
  },
  {
    "path": "lamindb/migrations/0177_squashed.py",
    "content": "# Generated by Django 5.2 on 2026-01-10 23:06\n\nimport django.core.validators\nimport django.db.models.deletion\nimport django.db.models.functions.datetime\nimport django.db.models.functions.text\nimport pgtrigger.compiler\nimport pgtrigger.migrations\nfrom django.db import connection, migrations, models\n\nimport lamindb.base.fields\nimport lamindb.base.uids\nimport lamindb.base.users\nimport lamindb.models.can_curate\nimport lamindb.models.has_parents\nimport lamindb.models.run\nimport lamindb.models.sqlrecord\n\nCREATE_IS_VALID_RECORD_TYPE_FUNCTION = \"\"\"\nCREATE OR REPLACE FUNCTION is_valid_record_type(record_type_id INTEGER, record_is_type BOOLEAN)\nRETURNS BOOLEAN AS $$\nBEGIN\n    -- Record with no type is valid\n    IF record_type_id IS NULL THEN\n        RETURN TRUE;\n    END IF;\n\n    -- If current record is a type, it can only reference schema-less types\n    IF record_is_type THEN\n        RETURN EXISTS (\n            SELECT 1 FROM lamindb_record r\n            WHERE r.id = record_type_id AND r.is_type AND r.schema_id IS NULL\n        );\n    END IF;\n\n    -- Regular records can reference any type\n    RETURN EXISTS (\n        SELECT 1 FROM lamindb_record r\n        WHERE r.id = record_type_id AND r.is_type\n    );\nEND;\n$$ LANGUAGE plpgsql;\n\"\"\"\n\nCREATE_IS_VALID_RECORD_TYPE_CONSTRAINT = \"\"\"\nALTER TABLE lamindb_record\nADD CONSTRAINT record_type_is_valid_fk\nCHECK (is_valid_record_type(type_id, is_type));\n\"\"\"\n\n\nCREATE_IS_VALID_FEATURE_TYPE_FUNCTION = \"\"\"\nCREATE OR REPLACE FUNCTION is_valid_feature_type(feature_type_id INTEGER)\nRETURNS BOOLEAN AS $$\nBEGIN\n    -- Feature with no type is valid\n    IF feature_type_id IS NULL THEN\n        RETURN TRUE;\n    END IF;\n\n    -- Type must have is_type = TRUE\n    RETURN EXISTS (\n        SELECT 1 FROM lamindb_feature f\n        WHERE f.id = feature_type_id AND f.is_type\n    );\nEND;\n$$ LANGUAGE plpgsql;\n\"\"\"\n\nCREATE_IS_VALID_FEATURE_TYPE_CONSTRAINT = 
\"\"\"\nALTER TABLE lamindb_feature\nADD CONSTRAINT feature_type_is_valid_fk\nCHECK (is_valid_feature_type(type_id));\n\"\"\"\n\n\nCREATE_IS_VALID_SCHEMA_TYPE_FUNCTION = \"\"\"\nCREATE OR REPLACE FUNCTION is_valid_schema_type(schema_type_id INTEGER)\nRETURNS BOOLEAN AS $$\nBEGIN\n    IF schema_type_id IS NULL THEN\n        RETURN TRUE;\n    END IF;\n\n    RETURN EXISTS (\n        SELECT 1 FROM lamindb_schema s\n        WHERE s.id = schema_type_id AND s.is_type\n    );\nEND;\n$$ LANGUAGE plpgsql;\n\"\"\"\n\nCREATE_IS_VALID_SCHEMA_TYPE_CONSTRAINT = \"\"\"\nALTER TABLE lamindb_schema\nADD CONSTRAINT schema_type_is_valid_fk\nCHECK (is_valid_schema_type(type_id));\n\"\"\"\n\n\nCREATE_IS_VALID_PROJECT_TYPE_FUNCTION = \"\"\"\nCREATE OR REPLACE FUNCTION is_valid_project_type(project_type_id INTEGER)\nRETURNS BOOLEAN AS $$\nBEGIN\n    IF project_type_id IS NULL THEN\n        RETURN TRUE;\n    END IF;\n\n    RETURN EXISTS (\n        SELECT 1 FROM lamindb_project p\n        WHERE p.id = project_type_id AND p.is_type\n    );\nEND;\n$$ LANGUAGE plpgsql;\n\"\"\"\n\nCREATE_IS_VALID_PROJECT_TYPE_CONSTRAINT = \"\"\"\nALTER TABLE lamindb_project\nADD CONSTRAINT project_type_is_valid_fk\nCHECK (is_valid_project_type(type_id));\n\"\"\"\n\n\nCREATE_IS_VALID_REFERENCE_TYPE_FUNCTION = \"\"\"\nCREATE OR REPLACE FUNCTION is_valid_reference_type(reference_type_id INTEGER)\nRETURNS BOOLEAN AS $$\nBEGIN\n    IF reference_type_id IS NULL THEN\n        RETURN TRUE;\n    END IF;\n\n    RETURN EXISTS (\n        SELECT 1 FROM lamindb_reference r\n        WHERE r.id = reference_type_id AND r.is_type\n    );\nEND;\n$$ LANGUAGE plpgsql;\n\"\"\"\n\nCREATE_IS_VALID_REFERENCE_TYPE_CONSTRAINT = \"\"\"\nALTER TABLE lamindb_reference\nADD CONSTRAINT reference_type_is_valid_fk\nCHECK (is_valid_reference_type(type_id));\n\"\"\"\n\n\nCREATE_IS_VALID_ULABEL_TYPE_FUNCTION = \"\"\"\nCREATE OR REPLACE FUNCTION is_valid_ulabel_type(ulabel_type_id INTEGER)\nRETURNS BOOLEAN AS $$\nBEGIN\n    IF ulabel_type_id IS 
NULL THEN\n        RETURN TRUE;\n    END IF;\n\n    RETURN EXISTS (\n        SELECT 1 FROM lamindb_ulabel u\n        WHERE u.id = ulabel_type_id AND u.is_type\n    );\nEND;\n$$ LANGUAGE plpgsql;\n\"\"\"\n\nCREATE_IS_VALID_ULABEL_TYPE_CONSTRAINT = \"\"\"\nALTER TABLE lamindb_ulabel\nADD CONSTRAINT ulabel_type_is_valid_fk\nCHECK (is_valid_ulabel_type(type_id));\n\"\"\"\n\n\ndef apply_constraints(apps, schema_editor):\n    if schema_editor.connection.vendor == \"postgresql\":\n        schema_editor.execute(CREATE_IS_VALID_RECORD_TYPE_FUNCTION)\n        schema_editor.execute(CREATE_IS_VALID_RECORD_TYPE_CONSTRAINT)\n        schema_editor.execute(CREATE_IS_VALID_FEATURE_TYPE_FUNCTION)\n        schema_editor.execute(CREATE_IS_VALID_FEATURE_TYPE_CONSTRAINT)\n        schema_editor.execute(CREATE_IS_VALID_SCHEMA_TYPE_FUNCTION)\n        schema_editor.execute(CREATE_IS_VALID_SCHEMA_TYPE_CONSTRAINT)\n        schema_editor.execute(CREATE_IS_VALID_PROJECT_TYPE_FUNCTION)\n        schema_editor.execute(CREATE_IS_VALID_PROJECT_TYPE_CONSTRAINT)\n        schema_editor.execute(CREATE_IS_VALID_REFERENCE_TYPE_FUNCTION)\n        schema_editor.execute(CREATE_IS_VALID_REFERENCE_TYPE_CONSTRAINT)\n        schema_editor.execute(CREATE_IS_VALID_ULABEL_TYPE_FUNCTION)\n        schema_editor.execute(CREATE_IS_VALID_ULABEL_TYPE_CONSTRAINT)\n\n\nclass Migration(migrations.Migration):\n    initial = True\n    dependencies = []  # type: ignore\n    operations = [\n        migrations.CreateModel(\n            name=\"Migration\",\n            fields=[\n                (\n                    \"id\",\n                    models.BigAutoField(\n                        auto_created=True,\n                        primary_key=True,\n                        serialize=False,\n                        verbose_name=\"ID\",\n                    ),\n                ),\n                (\n                    \"app\",\n                    lamindb.base.fields.CharField(\n                        blank=True, default=None, 
max_length=255\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True, default=None, max_length=255\n                    ),\n                ),\n                (\"applied\", lamindb.base.fields.DateTimeField(blank=True)),\n            ],\n            options={\n                \"db_table\": \"django_migrations\",\n                \"managed\": False,\n            },\n        ),\n        migrations.CreateModel(\n            name=\"Block\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    
\"kind\",\n                    models.CharField(\n                        db_default=\"mdpage\",\n                        db_index=True,\n                        default=\"mdpage\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\"key\", models.CharField(db_index=True, max_length=1024)),\n            ],\n        ),\n        migrations.CreateModel(\n            name=\"Branch\",\n            fields=[\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\"name\", models.CharField(db_index=True, max_length=100)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_12,\n                        editable=False,\n                        max_length=12,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n            ],\n        ),\n        migrations.CreateModel(\n         
   name=\"Space\",\n            fields=[\n                (\"id\", models.SmallAutoField(primary_key=True, serialize=False)),\n                (\"name\", models.CharField(db_index=True, max_length=100)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_12,\n                        editable=False,\n                        max_length=12,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n            ],\n        ),\n        migrations.CreateModel(\n            name=\"Artifact\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n               
     ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"key\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=1024,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"_real_key\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                   
     max_length=1024,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(\n                        blank=True, db_index=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"suffix\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=30,\n                    ),\n                ),\n                (\n                    \"kind\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=20,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"otype\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=64,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"size\",\n                    lamindb.base.fields.BigIntegerField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"hash\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n 
                       max_length=22,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"n_files\",\n                    lamindb.base.fields.BigIntegerField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"n_observations\",\n                    lamindb.base.fields.BigIntegerField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"_hash_type\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"_key_is_virtual\",\n                    lamindb.base.fields.BooleanField(blank=True, default=None),\n                ),\n                (\n                    \"_overwrite_versions\",\n                    lamindb.base.fields.BooleanField(blank=True, default=None),\n                ),\n                (\n                    \"_actions\",\n                    models.ManyToManyField(\n                        related_name=\"_action_targets\", to=\"lamindb.artifact\"\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactArtifact\",\n            fields=[\n                (\n                    
\"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_artifact\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"value\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_value\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"artifacts\",\n            field=models.ManyToManyField(\n                related_name=\"linked_by_artifacts\",\n                through=\"lamindb.ArtifactArtifact\",\n                to=\"lamindb.artifact\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"BlockProject\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n             
   (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"block\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.block\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"block\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_column=\"branch_id\",\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_column=\"branch_id\",\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Collection\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, 
db_index=True, default=True\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_20,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"key\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=255\n                    ),\n                ),\n                (\n                    \"description\",\n        
            lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"hash\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=22,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"reference\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"reference_type\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=25,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"_actions\",\n                    models.ManyToManyField(related_name=\"+\", to=\"lamindb.artifact\"),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_column=\"branch_id\",\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"meta_artifact\",\n                    lamindb.base.fields.OneToOneField(\n                        blank=True,\n                        null=True,\n                        
on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"_meta_of_collection\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n        ),\n        migrations.CreateModel(\n            name=\"CollectionArtifact\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_collection\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"collection\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_artifact\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"artifacts\",\n            field=models.ManyToManyField(\n                related_name=\"collections\",\n                through=\"lamindb.CollectionArtifact\",\n                to=\"lamindb.artifact\",\n            ),\n        ),\n     
   migrations.CreateModel(\n            name=\"CollectionProject\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"collection\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"CollectionReference\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"collection\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_reference\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n       
 migrations.CreateModel(\n            name=\"Feature\",\n            fields=[\n                (\n                    \"is_type\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, db_index=True, default=False\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_12,\n                        editable=False,\n                        max_length=12,\n                        unique=True,\n                    ),\n                ),\n                (\n                    
\"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=150\n                    ),\n                ),\n                (\n                    \"_dtype_str\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"unit\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\"array_rank\", models.SmallIntegerField(db_index=True, default=0)),\n                (\"array_size\", models.IntegerField(db_index=True, default=0)),\n                (\n                    \"array_shape\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"synonyms\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"default_value\",\n                    lamindb.base.fields.JSONField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"nullable\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, default=None, null=True\n                    ),\n                ),\n                
(\n                    \"coerce\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_column=\"branch_id\",\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"type\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"features\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n            bases=(lamindb.models.can_curate.CanCurate, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"CollectionRecord\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"collection\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n               
         related_name=\"links_record\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_collectionrecord\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactRun\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_run\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_artifactrun\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            
bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactReference\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_reference\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_artifactreference\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactRecord\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", 
models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_record\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_artifactrecord\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactProject\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        
default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_artifactproject\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"artifactartifact\",\n            name=\"feature\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=None,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_artifactartifact\",\n                to=\"lamindb.feature\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"FeatureProject\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"JsonValue\",\n            fields=[\n                (\n                    \"id\",\n                    models.BigAutoField(\n                        
auto_created=True,\n                        primary_key=True,\n                        serialize=False,\n                        verbose_name=\"ID\",\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"value\", models.JSONField()),\n                (\n                    \"hash\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=22,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_column=\"branch_id\",\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        
blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n                \"base_manager_name\": \"objects\",\n            },\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactJsonValue\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_jsonvalue\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"jsonvalue\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_artifact\",\n                        to=\"lamindb.jsonvalue\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"json_values\",\n            field=models.ManyToManyField(\n                
related_name=\"artifacts\",\n                through=\"lamindb.ArtifactJsonValue\",\n                to=\"lamindb.jsonvalue\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Project\",\n            fields=[\n                (\n                    \"is_type\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, db_index=True, default=False\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_12,\n                        editable=False,\n        
                max_length=12,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=255\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"abbr\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=32,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"url\",\n                    lamindb.base.fields.URLField(\n                        blank=True, default=None, max_length=255, null=True\n                    ),\n                ),\n                (\n                    \"start_date\",\n                    lamindb.base.fields.DateField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"end_date\",\n                    lamindb.base.fields.DateField(blank=True, default=None, null=True),\n                ),\n                (\"_status_code\", models.SmallIntegerField(db_index=True, default=0)),\n                (\n                    \"artifacts\",\n                    models.ManyToManyField(\n                        related_name=\"projects\",\n                        through=\"lamindb.ArtifactProject\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"blocks\",\n                    models.ManyToManyField(\n                        related_name=\"projects\",\n                        through=\"lamindb.BlockProject\",\n 
                       to=\"lamindb.block\",\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_column=\"branch_id\",\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"collections\",\n                    models.ManyToManyField(\n                        related_name=\"projects\",\n                        through=\"lamindb.CollectionProject\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n                (\n                    \"features\",\n                    models.ManyToManyField(\n                        related_name=\"projects\",\n                        through=\"lamindb.FeatureProject\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"parents\",\n                    models.ManyToManyField(\n                        related_name=\"children\", to=\"lamindb.project\"\n                    ),\n                ),\n                (\n                    \"predecessors\",\n                    models.ManyToManyField(\n                        related_name=\"successors\", to=\"lamindb.project\"\n                    ),\n                ),\n                (\n                    \"type\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"projects\",\n                        to=\"lamindb.project\",\n                    ),\n     
           ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n            bases=(\n                lamindb.models.can_curate.CanCurate,\n                models.Model,\n                lamindb.models.sqlrecord.ValidateFields,\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"featureproject\",\n            name=\"project\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_feature\",\n                to=\"lamindb.project\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionproject\",\n            name=\"project\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_collection\",\n                to=\"lamindb.project\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"blockproject\",\n            name=\"project\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_block\",\n                to=\"lamindb.project\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactproject\",\n            name=\"project\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_artifact\",\n                to=\"lamindb.project\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Record\",\n            fields=[\n                (\n                    \"is_type\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, 
db_default=False, db_index=True, default=False\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=16,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=150,\n             
           null=True,\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"reference\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"reference_type\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=25,\n                        null=True,\n                    ),\n                ),\n                (\"extra_data\", models.JSONField(null=True)),\n                (\n                    \"artifacts\",\n                    models.ManyToManyField(\n                        related_name=\"records\",\n                        through=\"lamindb.ArtifactRecord\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_column=\"branch_id\",\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"collections\",\n                    models.ManyToManyField(\n                        related_name=\"records\",\n                        through=\"lamindb.CollectionRecord\",\n               
         to=\"lamindb.collection\",\n                    ),\n                ),\n                (\n                    \"parents\",\n                    models.ManyToManyField(\n                        related_name=\"children\", to=\"lamindb.record\"\n                    ),\n                ),\n                (\n                    \"type\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"records\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n            bases=(\n                lamindb.models.has_parents.HasParents,\n                lamindb.models.can_curate.CanCurate,\n                models.Model,\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ProjectRecord\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_projectrecord\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"project\",\n      
              lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_record\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"records\",\n            field=models.ManyToManyField(\n                related_name=\"projects\",\n                through=\"lamindb.ProjectRecord\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionrecord\",\n            name=\"record\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_collection\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactrecord\",\n            name=\"record\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_artifact\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RecordArtifact\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n        
            \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordartifact\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_artifact\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"value\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_in_record\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"record\", \"feature\", \"value\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"linked_in_records\",\n            field=models.ManyToManyField(\n                related_name=\"linked_artifacts\",\n                through=\"lamindb.RecordArtifact\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RecordCollection\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n 
                       related_name=\"links_recordcollection\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_collection\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"value\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_in_record\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"record\", \"feature\", \"value\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"linked_in_records\",\n            field=models.ManyToManyField(\n                related_name=\"linked_collections\",\n                through=\"lamindb.RecordCollection\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RecordProject\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordproject\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n    
                \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_project\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"value\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_in_record\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"record\", \"feature\", \"value\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"linked_in_records\",\n            field=models.ManyToManyField(\n                related_name=\"linked_projects\",\n                through=\"lamindb.RecordProject\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RecordRecord\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordrecord\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n         
               related_name=\"values_record\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"value\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_record\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"record\", \"feature\", \"value\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"linked_records\",\n            field=models.ManyToManyField(\n                related_name=\"linked_in_records\",\n                through=\"lamindb.RecordRecord\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RecordReference\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordreference\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_reference\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, 
lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.CreateModel(\n            name=\"RecordRun\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordrun\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_run\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.CreateModel(\n            name=\"RecordTransform\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordtransform\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_transform\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, 
lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.CreateModel(\n            name=\"RecordULabel\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordulabel\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_ulabel\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.CreateModel(\n            name=\"RecordUser\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recorduser\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_user\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, 
lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.CreateModel(\n            name=\"Reference\",\n            fields=[\n                (\n                    \"is_type\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, db_index=True, default=False\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_12,\n                        editable=False,\n                        max_length=12,\n                        unique=True,\n                    ),\n             
   ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=255\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"abbr\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=32,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"url\",\n                    lamindb.base.fields.URLField(blank=True, db_index=True, null=True),\n                ),\n                (\n                    \"pubmed_id\",\n                    lamindb.base.fields.BigIntegerField(\n                        blank=True, db_index=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"doi\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                        validators=[\n                            django.core.validators.RegexValidator(\n                                message=\"Must be a DOI (e.g., 10.1000/xyz123 or https://doi.org/10.1000/xyz123)\",\n                                regex=\"^(?:https?://(?:dx\\\\.)?doi\\\\.org/|doi:|DOI:)?10\\\\.\\\\d+/.*$\",\n                            )\n                        ],\n                    ),\n                ),\n                (\n                    \"text\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n 
               ),\n                (\n                    \"date\",\n                    lamindb.base.fields.DateField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"artifacts\",\n                    models.ManyToManyField(\n                        related_name=\"references\",\n                        through=\"lamindb.ArtifactReference\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_column=\"branch_id\",\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"collections\",\n                    models.ManyToManyField(\n                        related_name=\"references\",\n                        through=\"lamindb.CollectionReference\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n                (\n                    \"linked_in_records\",\n                    models.ManyToManyField(\n                        related_name=\"linked_references\",\n                        through=\"lamindb.RecordReference\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"type\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"references\",\n                        to=\"lamindb.reference\",\n                    ),\n                
),\n            ],\n            options={\n                \"abstract\": False,\n            },\n            bases=(\n                lamindb.models.can_curate.CanCurate,\n                models.Model,\n                lamindb.models.sqlrecord.ValidateFields,\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"recordreference\",\n            name=\"value\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_in_record\",\n                to=\"lamindb.reference\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"references\",\n            field=models.ManyToManyField(\n                related_name=\"projects\", to=\"lamindb.reference\"\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionreference\",\n            name=\"reference\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_collection\",\n                to=\"lamindb.reference\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactreference\",\n            name=\"reference\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_artifact\",\n                to=\"lamindb.reference\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ReferenceRecord\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        
db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_referencerecord\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_reference\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"reference\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_record\",\n                        to=\"lamindb.reference\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"reference\",\n            name=\"records\",\n            field=models.ManyToManyField(\n                related_name=\"references\",\n                through=\"lamindb.ReferenceRecord\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Run\",\n            fields=[\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, 
db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=150,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"entrypoint\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"started_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        
db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"finished_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True, db_index=True, default=None, null=True\n                    ),\n                ),\n                (\"params\", models.JSONField(null=True)),\n                (\n                    \"reference\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"reference_type\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=25,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"cli_args\",\n                    lamindb.base.fields.CharField(\n                        blank=True, default=None, max_length=1024, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"_is_consecutive\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, default=None, null=True\n                    ),\n                ),\n                (\n              
      \"_status_code\",\n                    models.SmallIntegerField(\n                        db_default=-3, db_index=True, default=-3, null=True\n                    ),\n                ),\n                (\n                    \"artifacts\",\n                    models.ManyToManyField(\n                        related_name=\"runs\",\n                        through=\"lamindb.ArtifactRun\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_column=\"branch_id\",\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"environment\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"_environment_of\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"initiated_by_run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"initiated_runs\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"linked_in_records\",\n                    models.ManyToManyField(\n                        
related_name=\"linked_runs\",\n                        through=\"lamindb.RecordRun\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"report\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"_report_of\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"referencerecord\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"reference\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"recordrun\",\n            name=\"value\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_in_record\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"input_of_runs\",\n            
field=models.ManyToManyField(\n                related_name=\"input_records\", to=\"lamindb.run\"\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                editable=False,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"output_records\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"projectrecord\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"jsonvalue\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"featureproject\",\n            name=\"run\",\n            
field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"feature\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionreference\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionrecord\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionproject\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                
to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionartifact\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"input_of_runs\",\n            field=models.ManyToManyField(\n                related_name=\"input_collections\", to=\"lamindb.run\"\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"recreating_runs\",\n            field=models.ManyToManyField(\n                related_name=\"recreated_collections\", to=\"lamindb.run\"\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=None,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"output_collections\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"blockproject\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactUser\",\n            fields=[\n                (\n                    \"created_at\",\n                   
 lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_user\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_artifactuser\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactULabel\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        
db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_ulabel\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_artifactulabel\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"artifactrun\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.CASCADE,\n                related_name=\"links_artifact\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n   
         model_name=\"artifactreference\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactrecord\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactproject\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactjsonvalue\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactartifact\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                
on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"input_of_runs\",\n            field=models.ManyToManyField(\n                related_name=\"input_artifacts\", to=\"lamindb.run\"\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"recreating_runs\",\n            field=models.ManyToManyField(\n                related_name=\"recreated_artifacts\", to=\"lamindb.run\"\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=None,\n                editable=False,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"output_artifacts\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RunJsonValue\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"jsonvalue\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_run\",\n                        to=\"lamindb.jsonvalue\",\n                    ),\n                ),\n                
(\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_jsonvalue\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"json_values\",\n            field=models.ManyToManyField(\n                related_name=\"runs\",\n                through=\"lamindb.RunJsonValue\",\n                to=\"lamindb.jsonvalue\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RunProject\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"project\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_run\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n            ],\n     
       bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"runs\",\n            field=models.ManyToManyField(\n                related_name=\"projects\", through=\"lamindb.RunProject\", to=\"lamindb.run\"\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RunRecord\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_runrecord\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_run\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_record\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, 
lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"runs\",\n            field=models.ManyToManyField(\n                related_name=\"records\", through=\"lamindb.RunRecord\", to=\"lamindb.run\"\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Schema\",\n            fields=[\n                (\n                    \"is_type\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, db_index=True, default=False\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n   
                     db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=16,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=150,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"n_members\",\n                    lamindb.base.fields.IntegerField(\n                        blank=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"coerce\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"flexible\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"itype\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=120,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"otype\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n               
         max_length=64,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"_dtype_str\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        default=None,\n                        editable=False,\n                        max_length=64,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"hash\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=22,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"minimal_set\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True, editable=False\n                    ),\n                ),\n                (\n                    \"ordered_set\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=False, editable=False\n                    ),\n                ),\n                (\n                    \"maximal_set\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=False, editable=False\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_column=\"branch_id\",\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n    
            ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"type\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"schemas\",\n                        to=\"lamindb.schema\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n            bases=(lamindb.models.can_curate.CanCurate, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"schema\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                null=True,\n                on_delete=django.db.models.deletion.CASCADE,\n                related_name=\"records\",\n                to=\"lamindb.schema\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactSchema\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"slot\",\n                  
  lamindb.base.fields.CharField(\n                        blank=True, default=None, max_length=255, null=True\n                    ),\n                ),\n                (\n                    \"feature_ref_is_semantic\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"_links_schema\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"schema\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"_links_artifact\",\n                        to=\"lamindb.schema\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"schema\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=None,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                
related_name=\"validated_artifacts\",\n                to=\"lamindb.schema\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"schemas\",\n            field=models.ManyToManyField(\n                related_name=\"artifacts\",\n                through=\"lamindb.ArtifactSchema\",\n                to=\"lamindb.schema\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"SchemaComponent\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"slot\",\n                    lamindb.base.fields.CharField(\n                        blank=True, default=None, max_length=255, null=True\n                    ),\n                ),\n                (\n                    \"component\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_composite\",\n                        to=\"lamindb.schema\",\n                    ),\n                ),\n                (\n                    \"composite\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_component\",\n                        to=\"lamindb.schema\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    
lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"schema\",\n            name=\"components\",\n            field=models.ManyToManyField(\n                related_name=\"composites\",\n                through=\"lamindb.SchemaComponent\",\n                to=\"lamindb.schema\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"SchemaFeature\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_schema\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"schema\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_feature\",\n                        to=\"lamindb.schema\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"schema\", \"feature\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"feature\",\n            name=\"schemas\",\n            
field=models.ManyToManyField(\n                related_name=\"features\",\n                through=\"lamindb.SchemaFeature\",\n                to=\"lamindb.schema\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"SchemaProject\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"project\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_schema\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"schema\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.schema\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        
migrations.AddField(\n            model_name=\"project\",\n            name=\"schemas\",\n            field=models.ManyToManyField(\n                related_name=\"projects\",\n                through=\"lamindb.SchemaProject\",\n                to=\"lamindb.schema\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"schema\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"reference\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"space\",\n       
     field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"jsonvalue\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"feature\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"branch\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"block\",\n            
name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Storage\",\n            fields=[\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", 
models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_12,\n                        editable=False,\n                        max_length=12,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"root\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"type\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=30\n                    ),\n                ),\n                (\n                    \"region\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=64,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"instance_uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=12,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                  
      blank=True,\n                        db_column=\"branch_id\",\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"space\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.space\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"storage\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"artifacts\",\n                to=\"lamindb.storage\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Transform\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                
        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=16,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"key\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=1024\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(\n                        blank=True, db_index=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"kind\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=\"pipeline\", max_length=20\n                    ),\n                ),\n                (\n                    
\"source_code\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"hash\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=22,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"reference\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"reference_type\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=25,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        
blank=True,\n                        db_column=\"branch_id\",\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"environment\",\n                    models.ForeignKey(\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"_environment_of_transforms\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"linked_in_records\",\n                    models.ManyToManyField(\n                        related_name=\"linked_transforms\",\n                        through=\"lamindb.RecordTransform\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"space\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.space\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"transform\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.CASCADE,\n                related_name=\"runs\",\n                to=\"lamindb.transform\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"recordtransform\",\n            
name=\"value\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_in_record\",\n                to=\"lamindb.transform\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"TransformProject\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"project\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_transform\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"transform\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.transform\",\n                    ),\n                ),\n            
],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"transforms\",\n            field=models.ManyToManyField(\n                related_name=\"projects\",\n                through=\"lamindb.TransformProject\",\n                to=\"lamindb.transform\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"TransformRecord\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_transformrecord\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_transform\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                       
 to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"transform\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_record\",\n                        to=\"lamindb.transform\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"transforms\",\n            field=models.ManyToManyField(\n                related_name=\"records\",\n                through=\"lamindb.TransformRecord\",\n                to=\"lamindb.transform\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"TransformReference\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"reference\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_transform\",\n                        to=\"lamindb.reference\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                   
     on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"transform\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_reference\",\n                        to=\"lamindb.transform\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"reference\",\n            name=\"transforms\",\n            field=models.ManyToManyField(\n                related_name=\"references\",\n                through=\"lamindb.TransformReference\",\n                to=\"lamindb.transform\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"TransformTransform\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\"config\", models.JSONField(default=None, null=True)),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"predecessor\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_successor\",\n                        to=\"lamindb.transform\",\n                    ),\n                ),\n                (\n                    \"successor\",\n                    
lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_predecessor\",\n                        to=\"lamindb.transform\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"transform\",\n            name=\"predecessors\",\n            field=models.ManyToManyField(\n                related_name=\"successors\",\n                through=\"lamindb.TransformTransform\",\n                to=\"lamindb.transform\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ULabel\",\n            fields=[\n                (\n                    \"is_type\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, db_index=True, default=False\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                
        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_8,\n                        editable=False,\n                        max_length=8,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=150\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"reference\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"reference_type\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=25,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"artifacts\",\n                    models.ManyToManyField(\n                        related_name=\"ulabels\",\n                        through=\"lamindb.ArtifactULabel\",\n                        
to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_column=\"branch_id\",\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"linked_in_records\",\n                    models.ManyToManyField(\n                        related_name=\"linked_ulabels\",\n                        through=\"lamindb.RecordULabel\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"parents\",\n                    models.ManyToManyField(\n                        related_name=\"children\", to=\"lamindb.ulabel\"\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"space\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.space\",\n                    ),\n                ),\n                (\n                    \"type\",\n       
             lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"ulabels\",\n                        to=\"lamindb.ulabel\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n            bases=(\n                lamindb.models.has_parents.HasParents,\n                lamindb.models.can_curate.CanCurate,\n                models.Model,\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"TransformULabel\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"transform\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_ulabel\",\n                        to=\"lamindb.transform\",\n                    ),\n                ),\n                (\n                    
\"ulabel\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_transform\",\n                        to=\"lamindb.ulabel\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"transform\",\n            name=\"ulabels\",\n            field=models.ManyToManyField(\n                related_name=\"transforms\",\n                through=\"lamindb.TransformULabel\",\n                to=\"lamindb.ulabel\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RunULabel\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_ulabel\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"ulabel\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_run\",\n                        to=\"lamindb.ulabel\",\n                    ),\n                ),\n            ],\n            
bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"ulabels\",\n            field=models.ManyToManyField(\n                related_name=\"runs\", through=\"lamindb.RunULabel\", to=\"lamindb.ulabel\"\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"recordulabel\",\n            name=\"value\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_record\",\n                to=\"lamindb.ulabel\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"CollectionULabel\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"collection\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_ulabel\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_collectionulabel\",\n                        to=\"lamindb.feature\",\n             
       ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"ulabel\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_collection\",\n                        to=\"lamindb.ulabel\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"ulabels\",\n            field=models.ManyToManyField(\n                related_name=\"collections\",\n                through=\"lamindb.CollectionULabel\",\n                to=\"lamindb.ulabel\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactulabel\",\n            name=\"ulabel\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_artifact\",\n                to=\"lamindb.ulabel\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ULabelProject\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        
editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"project\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_ulabel\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"ulabel\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.ulabel\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"ulabels\",\n            field=models.ManyToManyField(\n                related_name=\"projects\",\n                through=\"lamindb.ULabelProject\",\n                to=\"lamindb.ulabel\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"User\",\n            fields=[\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        
blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=8,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"handle\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=150,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"artifacts\",\n                    models.ManyToManyField(\n                        related_name=\"users\",\n                        through=\"lamindb.ArtifactUser\",\n                        through_fields=(\"user\", \"artifact\"),\n                        to=\"lamindb.artifact\",\n                    ),\n           
     ),\n                (\n                    \"linked_in_records\",\n                    models.ManyToManyField(\n                        related_name=\"linked_users\",\n                        through=\"lamindb.RecordUser\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.can_curate.CanCurate),\n        ),\n        migrations.AddField(\n            model_name=\"ulabelproject\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ULabelBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", 
models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"mdpage\",\n                        db_index=True,\n                        default=\"mdpage\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"ulabel\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.ulabel\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"ulabel\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            
model_name=\"transformulabel\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"transformtransform\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"transformreference\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"transformrecord\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"transformproject\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                
on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"TransformBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"mdpage\",\n                        db_index=True,\n                        default=\"mdpage\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n         
       (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\"line_number\", models.IntegerField(null=True)),\n                (\n                    \"transform\",\n                    models.ForeignKey(\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.transform\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"transform\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"created_transforms\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"storage\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"SpaceBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        
blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"mdpage\",\n                        db_index=True,\n                        default=\"mdpage\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"space\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.space\",\n                    ),\n            
    ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"space\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=None,\n                null=True,\n                on_delete=django.db.models.deletion.CASCADE,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"schemaproject\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"schemacomponent\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"SchemaBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n               
         default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"mdpage\",\n                        db_index=True,\n                        default=\"mdpage\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"schema\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.schema\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n  
                  models.ForeignKey(\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"schema\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"runulabel\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"runrecord\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"runproject\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                
to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"runjsonvalue\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RunBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"mdpage\",\n                        db_index=True,\n                        default=\"mdpage\",\n                        max_length=22,\n                    ),\n                ),\n     
           (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"run\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.CASCADE,\n                related_name=\"created_runs\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"referencerecord\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            
model_name=\"reference\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"recorduser\",\n            name=\"value\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_record\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RecordBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n               
 (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"mdpage\",\n                        db_index=True,\n                        default=\"mdpage\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"record\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"projectrecord\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n           
     blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ProjectBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"mdpage\",\n                        db_index=True,\n                        default=\"mdpage\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n        
                db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"project\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"jsonvalue\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"featureproject\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n             
   default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"FeatureBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"mdpage\",\n                        db_index=True,\n                        default=\"mdpage\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        
db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"feature\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"feature\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionulabel\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionreference\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                
default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionrecord\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionproject\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"CollectionBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        
db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"mdpage\",\n                        db_index=True,\n                        default=\"mdpage\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"collection\",\n                    models.ForeignKey(\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"collectionartifact\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n     
           blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"BranchBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    
\"kind\",\n                    models.CharField(\n                        db_default=\"mdpage\",\n                        db_index=True,\n                        default=\"mdpage\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"branch\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"branch\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=None,\n                null=True,\n                on_delete=django.db.models.deletion.CASCADE,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"blockproject\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                
default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"block\",\n            name=\"created_by\",\n            field=models.ForeignKey(\n                default=None,\n                null=True,\n                on_delete=django.db.models.deletion.CASCADE,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactuser\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactuser\",\n            name=\"user\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_artifact\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactulabel\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactschema\",\n            name=\"created_by\",\n            
field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactrun\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactreference\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactrecord\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactproject\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                
on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactjsonvalue\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"mdpage\",\n                        
db_index=True,\n                        default=\"mdpage\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"artifact\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"artifactartifact\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                
on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"created_artifacts\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RecordJson\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"value\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordjson\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_json\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"record\", \"feature\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"recordreference\",\n            unique_together={(\"record\", \"feature\", \"value\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"recordrun\",\n            unique_together={(\"record\", \"feature\", \"value\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"recordtransform\",\n            unique_together={(\"record\", \"feature\", \"value\")},\n        ),\n        migrations.AlterUniqueTogether(\n    
        name=\"recordulabel\",\n            unique_together={(\"record\", \"feature\", \"value\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"ulabelproject\",\n            unique_together={(\"ulabel\", \"project\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"transformulabel\",\n            unique_together={(\"transform\", \"ulabel\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"transformtransform\",\n            unique_together={(\"successor\", \"predecessor\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"transformreference\",\n            unique_together={(\"transform\", \"reference\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"transformrecord\",\n            unique_together={(\"transform\", \"record\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"transformproject\",\n            unique_together={(\"transform\", \"project\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"transform\",\n            unique_together={(\"key\", \"hash\")},\n        ),\n        migrations.AddConstraint(\n            model_name=\"space\",\n            constraint=models.UniqueConstraint(\n                django.db.models.functions.text.Lower(\"name\"),\n                name=\"unique_space_name_lower\",\n            ),\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"schemaproject\",\n            unique_together={(\"schema\", \"project\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"schemacomponent\",\n            unique_together={(\"composite\", \"slot\"), (\"composite\", \"slot\", \"component\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"runulabel\",\n            unique_together={(\"run\", \"ulabel\")},\n        ),\n        migrations.AlterUniqueTogether(\n            
name=\"runrecord\",\n            unique_together={(\"run\", \"record\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"runproject\",\n            unique_together={(\"run\", \"project\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"runjsonvalue\",\n            unique_together={(\"run\", \"jsonvalue\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"referencerecord\",\n            unique_together={(\"reference\", \"feature\", \"record\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"recorduser\",\n            unique_together={(\"record\", \"feature\", \"value\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"projectrecord\",\n            unique_together={(\"project\", \"feature\", \"record\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"jsonvalue\",\n            unique_together={(\"feature\", \"hash\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"featureproject\",\n            unique_together={(\"feature\", \"project\")},\n        ),\n        migrations.AddConstraint(\n            model_name=\"feature\",\n            constraint=models.CheckConstraint(\n                condition=models.Q(\n                    (\"is_type\", True), (\"_dtype_str__isnull\", False), _connector=\"OR\"\n                ),\n                name=\"feature_dtype_str_not_null_when_is_type_false\",\n            ),\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"collectionulabel\",\n            unique_together={(\"collection\", \"ulabel\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"collectionreference\",\n            unique_together={(\"collection\", \"reference\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"collectionrecord\",\n            unique_together={(\"collection\", \"record\", 
\"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"collectionproject\",\n            unique_together={(\"collection\", \"project\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"collectionartifact\",\n            unique_together={(\"collection\", \"artifact\")},\n        ),\n        migrations.AddConstraint(\n            model_name=\"collection\",\n            constraint=models.UniqueConstraint(\n                fields=(\"key\", \"hash\"), name=\"unique_collection_key_hash_not_null\"\n            ),\n        ),\n        migrations.AddConstraint(\n            model_name=\"branch\",\n            constraint=models.UniqueConstraint(\n                django.db.models.functions.text.Lower(\"name\"),\n                name=\"unique_branch_name_lower\",\n            ),\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"blockproject\",\n            unique_together={(\"block\", \"project\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactuser\",\n            unique_together={(\"artifact\", \"user\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactulabel\",\n            unique_together={(\"artifact\", \"ulabel\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactschema\",\n            unique_together={(\"artifact\", \"schema\"), (\"artifact\", \"slot\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactrun\",\n            unique_together={(\"artifact\", \"run\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactreference\",\n            unique_together={(\"artifact\", \"reference\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactrecord\",\n            unique_together={(\"artifact\", \"record\", \"feature\")},\n        ),\n        
migrations.AlterUniqueTogether(\n            name=\"artifactproject\",\n            unique_together={(\"artifact\", \"project\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactjsonvalue\",\n            unique_together={(\"artifact\", \"jsonvalue\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactartifact\",\n            unique_together={(\"artifact\", \"value\", \"feature\")},\n        ),\n        migrations.AddConstraint(\n            model_name=\"artifact\",\n            constraint=models.UniqueConstraint(\n                condition=models.Q((\"key__isnull\", False)),\n                fields=(\"storage\", \"key\", \"hash\"),\n                name=\"unique_artifact_storage_key_hash_not_null\",\n            ),\n        ),\n        migrations.AddConstraint(\n            model_name=\"artifact\",\n            constraint=models.UniqueConstraint(\n                condition=models.Q((\"key__isnull\", True)),\n                fields=(\"storage\", \"hash\"),\n                name=\"unique_artifact_storage_hash_null_key\",\n            ),\n        ),\n        migrations.RunPython(apply_constraints),\n    ]\n\n\nif connection.vendor == \"postgresql\":\n    Migration.operations += [\n        pgtrigger.migrations.AddTrigger(\n            model_name=\"ulabel\",\n            trigger=pgtrigger.compiler.Trigger(\n                name=\"prevent_ulabel_type_cycle\",\n                sql=pgtrigger.compiler.UpsertTriggerSql(\n                    condition=\"WHEN (NEW.type_id IS NOT NULL)\",\n                    func=\"\\n                        -- Check for direct self-reference\\n                        IF NEW.type_id = NEW.id THEN\\n                            RAISE EXCEPTION 'Cannot set type: ulabel cannot be its own type';\\n                        END IF;\\n\\n                        -- Check for cycles in the type chain\\n                        IF EXISTS (\\n                            WITH 
RECURSIVE type_chain AS (\\n                                SELECT type_id, 1 as depth\\n                                FROM lamindb_ulabel\\n                                WHERE id = NEW.type_id\\n\\n                                UNION ALL\\n\\n                                SELECT r.type_id, tc.depth + 1\\n                                FROM lamindb_ulabel r\\n                                INNER JOIN type_chain tc ON r.id = tc.type_id\\n                                WHERE tc.depth < 100\\n                            )\\n                            SELECT 1 FROM type_chain WHERE type_id = NEW.id\\n                        ) THEN\\n                            RAISE EXCEPTION 'Cannot set type: would create a cycle';\\n                        END IF;\\n\\n                        RETURN NEW;\\n                    \",\n                    hash=\"53487a8e36a64748418457f7229de6d5cf31e6bd\",\n                    operation=\"UPDATE OR INSERT\",\n                    pgid=\"pgtrigger_prevent_ulabel_type_cycle_863ae\",\n                    table=\"lamindb_ulabel\",\n                    when=\"BEFORE\",\n                ),\n            ),\n        ),\n        pgtrigger.migrations.AddTrigger(\n            model_name=\"record\",\n            trigger=pgtrigger.compiler.Trigger(\n                name=\"prevent_record_type_cycle\",\n                sql=pgtrigger.compiler.UpsertTriggerSql(\n                    condition=\"WHEN (NEW.type_id IS NOT NULL)\",\n                    func=\"\\n                        -- Check for direct self-reference\\n                        IF NEW.type_id = NEW.id THEN\\n                            RAISE EXCEPTION 'Cannot set type: record cannot be its own type';\\n                        END IF;\\n\\n                        -- Check for cycles in the type chain\\n                        IF EXISTS (\\n                            WITH RECURSIVE type_chain AS (\\n                                SELECT type_id, 1 as depth\\n                         
       FROM lamindb_record\\n                                WHERE id = NEW.type_id\\n\\n                                UNION ALL\\n\\n                                SELECT r.type_id, tc.depth + 1\\n                                FROM lamindb_record r\\n                                INNER JOIN type_chain tc ON r.id = tc.type_id\\n                                WHERE tc.depth < 100\\n                            )\\n                            SELECT 1 FROM type_chain WHERE type_id = NEW.id\\n                        ) THEN\\n                            RAISE EXCEPTION 'Cannot set type: would create a cycle';\\n                        END IF;\\n\\n                        RETURN NEW;\\n                    \",\n                    hash=\"deaab832a066dfec76228f5b7a62a08f334876a9\",\n                    operation=\"UPDATE OR INSERT\",\n                    pgid=\"pgtrigger_prevent_record_type_cycle_56c18\",\n                    table=\"lamindb_record\",\n                    when=\"BEFORE\",\n                ),\n            ),\n        ),\n        pgtrigger.migrations.AddTrigger(\n            model_name=\"feature\",\n            trigger=pgtrigger.compiler.Trigger(\n                name=\"update_feature_on_name_change\",\n                sql=pgtrigger.compiler.UpsertTriggerSql(\n                    condition=\"WHEN (OLD.name IS DISTINCT FROM NEW.name)\",\n                    func=\"DECLARE\\n    old_renamed JSONB;\\n    new_renamed JSONB;\\n    ts TEXT;\\nBEGIN\\n    -- Only proceed if name actually changed\\n    IF OLD.name IS DISTINCT FROM NEW.name THEN\\n        -- Update synonyms\\n        IF NEW.synonyms IS NULL OR NEW.synonyms = '' THEN\\n            NEW.synonyms := OLD.name;\\n        ELSIF position(OLD.name in NEW.synonyms) = 0 THEN\\n            NEW.synonyms := NEW.synonyms || '|' || OLD.name;\\n        END IF;\\n\\n        -- Update _aux with rename history\\n        ts := TO_CHAR(NOW() AT TIME ZONE 'UTC', 'YYYY-MM-DD\\\"T\\\"HH24:MI:SS\\\"Z\\\"');\\n\\n      
  -- Get existing renamed history or initialize empty object\\n        old_renamed := COALESCE((OLD._aux->>'renamed')::JSONB, '{}'::JSONB);\\n\\n        -- Add old name with timestamp\\n        new_renamed := old_renamed || jsonb_build_object(ts, OLD.name);\\n\\n        -- Update _aux with new renamed history\\n        IF NEW._aux IS NULL THEN\\n            NEW._aux := jsonb_build_object('renamed', new_renamed);\\n        ELSE\\n            NEW._aux := NEW._aux || jsonb_build_object('renamed', new_renamed);\\n        END IF;\\n    END IF;\\n\\n    RETURN NEW;\\nEND;\\n\",\n                    hash=\"5f2e7a65e42c34b0455f0840def52f078726e401\",\n                    operation=\"UPDATE\",\n                    pgid=\"pgtrigger_update_feature_on_name_change_6c32d\",\n                    table=\"lamindb_feature\",\n                    when=\"BEFORE\",\n                ),\n            ),\n        ),\n    ]\n"
  },
  {
    "path": "lamindb/migrations/0178_v2_2.py",
    "content": "# Generated by Django 5.2 on 2026-02-15 11:25\n\nimport django.db.models.deletion\nfrom django.db import migrations, models\n\nimport lamindb.base.fields\nimport lamindb.models.sqlrecord\n\n\nclass Migration(migrations.Migration):\n    dependencies = [\n        (\"lamindb\", \"0177_squashed\"),\n    ]\n\n    operations = [\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"block\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"feature\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        
migrations.AddField(\n            model_name=\"jsonvalue\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"reference\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n  
          ),\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"description\",\n            field=lamindb.base.fields.TextField(blank=True, default=None, null=True),\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"plan\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=None,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"_plan_for_runs\",\n                to=\"lamindb.artifact\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"schema\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"storage\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"transform\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"transform\",\n            name=\"plan\",\n            field=models.ForeignKey(\n        
        default=None,\n                null=True,\n                on_delete=django.db.models.deletion.CASCADE,\n                related_name=\"_plan_for_transforms\",\n                to=\"lamindb.artifact\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"ulabel\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"artifact\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"block\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"collection\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"feature\",\n            name=\"branch\",\n            
field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"jsonvalue\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"project\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"record\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"reference\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"run\",\n            
name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"schema\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"storage\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"transform\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"ulabel\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.CreateModel(\n            
name=\"BranchPlan\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_branchplan\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_branchplan\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"branch\", \"artifact\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"branch\",\n            name=\"plans\",\n            field=models.ManyToManyField(\n                related_name=\"_plan_for_branches\",\n                through=\"lamindb.BranchPlan\",\n                to=\"lamindb.artifact\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"BranchProject\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"project\",\n                    
lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_branch\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"branch\", \"project\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"branch\",\n            name=\"projects\",\n            field=models.ManyToManyField(\n                related_name=\"branches\",\n                through=\"lamindb.BranchProject\",\n                to=\"lamindb.project\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"BranchULabel\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_ulabel\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"ulabel\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_branch\",\n                        to=\"lamindb.ulabel\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"branch\", \"ulabel\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"branch\",\n            name=\"ulabels\",\n           
 field=models.ManyToManyField(\n                related_name=\"branches\",\n                through=\"lamindb.BranchULabel\",\n                to=\"lamindb.ulabel\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"BranchUser\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"role\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=32\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_user\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"user\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_branch\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"branch\", \"user\", \"role\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"branch\",\n            name=\"users\",\n            field=models.ManyToManyField(\n                related_name=\"branches\", through=\"lamindb.BranchUser\", to=\"lamindb.user\"\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ProjectUser\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    
\"role\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=32\n                    ),\n                ),\n                (\n                    \"project\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_user\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n                (\n                    \"user\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_project\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"project\", \"user\", \"role\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"users\",\n            field=models.ManyToManyField(\n                related_name=\"projects\",\n                through=\"lamindb.ProjectUser\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RunArtifact\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_runartifact\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    
\"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_runartifact\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_runartifact\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"run\", \"artifact\", \"feature\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"linked_artifacts\",\n            field=models.ManyToManyField(\n                related_name=\"linked_by_runs\",\n                through=\"lamindb.RunArtifact\",\n                to=\"lamindb.artifact\",\n            ),\n        ),\n    ]\n"
  },
  {
    "path": "lamindb/migrations/0179_v2_2_part_2.py",
    "content": "# Generated by Django 5.2 on 2026-02-15 14:12\n\nimport django.db.models.deletion\nfrom django.db import migrations, models\n\nimport lamindb.base.fields\nimport lamindb.models.sqlrecord\n\n\nclass Migration(migrations.Migration):\n    dependencies = [\n        (\"lamindb\", \"0178_v2_2\"),\n    ]\n\n    operations = [\n        migrations.RemoveField(\n            model_name=\"branch\",\n            name=\"plans\",\n        ),\n        migrations.CreateModel(\n            name=\"BranchArtifact\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_branch\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_artifact\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"branch\", \"artifact\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"branch\",\n            name=\"artifacts\",\n            field=models.ManyToManyField(\n                related_name=\"linked_by_branches\",\n                through=\"lamindb.BranchArtifact\",\n                to=\"lamindb.artifact\",\n            ),\n        ),\n        migrations.DeleteModel(\n            name=\"BranchPlan\",\n        ),\n    ]\n"
  },
  {
    "path": "lamindb/migrations/0180_v2_2_part_3.py",
    "content": "# Generated by Django 5.2 on 2026-02-15 14:29\n\nimport django.db.models.deletion\nfrom django.db import migrations\n\nimport lamindb.base.fields\n\n\nclass Migration(migrations.Migration):\n    dependencies = [\n        (\"lamindb\", \"0179_v2_2_part_2\"),\n    ]\n\n    operations = [\n        migrations.AlterField(\n            model_name=\"runartifact\",\n            name=\"artifact\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_in_run\",\n                to=\"lamindb.artifact\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"runartifact\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.CASCADE,\n                related_name=\"values_artifact\",\n                to=\"lamindb.run\",\n            ),\n        ),\n    ]\n"
  },
  {
    "path": "lamindb/migrations/0181_v2_2_part_4.py",
    "content": "# Generated by Django 5.2 on 2026-02-15 15:43\n\nimport django.db.models.deletion\nfrom django.db import migrations, models\n\n\nclass Migration(migrations.Migration):\n    dependencies = [\n        (\"lamindb\", \"0180_v2_2_part_3\"),\n    ]\n\n    operations = [\n        migrations.AddField(\n            model_name=\"block\",\n            name=\"anchor\",\n            field=models.ForeignKey(\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"children\",\n                to=\"lamindb.block\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"block\",\n            name=\"key\",\n            field=models.CharField(db_index=True, max_length=1024, null=True),\n        ),\n    ]\n"
  },
  {
    "path": "lamindb/migrations/0182_v2_2_part_5.py",
    "content": "# Generated by Django 5.2 on 2026-02-17 16:33\n\nimport django.db.models.deletion\nfrom django.db import migrations, models\n\nimport lamindb.base.fields\nimport lamindb.base.users\n\n\nclass Migration(migrations.Migration):\n    dependencies = [\n        (\"lamindb\", \"0181_v2_2_part_4\"),\n    ]\n\n    operations = [\n        migrations.AddField(\n            model_name=\"branch\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_index=True, default=0),\n        ),\n        migrations.AlterField(\n            model_name=\"artifactblock\",\n            name=\"created_by\",\n            field=models.ForeignKey(\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"artifactblock\",\n            name=\"kind\",\n            field=models.CharField(\n                db_default=\"readme\", db_index=True, default=\"readme\", max_length=22\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"block\",\n            name=\"created_by\",\n            field=models.ForeignKey(\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"block\",\n            name=\"kind\",\n            field=models.CharField(\n                db_default=\"readme\", db_index=True, default=\"readme\", max_length=22\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"branch\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                
default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"branchblock\",\n            name=\"created_by\",\n            field=models.ForeignKey(\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"branchblock\",\n            name=\"kind\",\n            field=models.CharField(\n                db_default=\"readme\", db_index=True, default=\"readme\", max_length=22\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"collectionblock\",\n            name=\"created_by\",\n            field=models.ForeignKey(\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"collectionblock\",\n            name=\"kind\",\n            field=models.CharField(\n                db_default=\"readme\", db_index=True, default=\"readme\", max_length=22\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"featureblock\",\n            name=\"created_by\",\n            field=models.ForeignKey(\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"featureblock\",\n            name=\"kind\",\n            field=models.CharField(\n                
db_default=\"readme\", db_index=True, default=\"readme\", max_length=22\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"projectblock\",\n            name=\"created_by\",\n            field=models.ForeignKey(\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"projectblock\",\n            name=\"kind\",\n            field=models.CharField(\n                db_default=\"readme\", db_index=True, default=\"readme\", max_length=22\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"recordblock\",\n            name=\"created_by\",\n            field=models.ForeignKey(\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"recordblock\",\n            name=\"kind\",\n            field=models.CharField(\n                db_default=\"readme\", db_index=True, default=\"readme\", max_length=22\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"runblock\",\n            name=\"created_by\",\n            field=models.ForeignKey(\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"runblock\",\n            name=\"kind\",\n            field=models.CharField(\n                db_default=\"readme\", db_index=True, default=\"readme\", max_length=22\n            ),\n        ),\n        
migrations.AlterField(\n            model_name=\"schemablock\",\n            name=\"created_by\",\n            field=models.ForeignKey(\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"schemablock\",\n            name=\"kind\",\n            field=models.CharField(\n                db_default=\"readme\", db_index=True, default=\"readme\", max_length=22\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"spaceblock\",\n            name=\"created_by\",\n            field=models.ForeignKey(\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"spaceblock\",\n            name=\"kind\",\n            field=models.CharField(\n                db_default=\"readme\", db_index=True, default=\"readme\", max_length=22\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"transformblock\",\n            name=\"created_by\",\n            field=models.ForeignKey(\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"transformblock\",\n            name=\"kind\",\n            field=models.CharField(\n                db_default=\"readme\", db_index=True, default=\"readme\", max_length=22\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"ulabelblock\",\n            name=\"created_by\",\n            
field=models.ForeignKey(\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"ulabelblock\",\n            name=\"kind\",\n            field=models.CharField(\n                db_default=\"readme\", db_index=True, default=\"readme\", max_length=22\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactblock\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),\n        ),\n        migrations.AddField(\n            model_name=\"block\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),\n        ),\n        migrations.AddField(\n            model_name=\"branchblock\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),\n        ),\n        migrations.AddField(\n            model_name=\"collectionblock\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),\n        ),\n        migrations.AddField(\n            model_name=\"featureblock\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),\n        ),\n        migrations.AddField(\n            model_name=\"projectblock\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),\n        ),\n        migrations.AddField(\n            model_name=\"recordblock\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),\n        ),\n        migrations.AddField(\n            
model_name=\"runblock\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),\n        ),\n        migrations.AddField(\n            model_name=\"schemablock\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),\n        ),\n        migrations.AddField(\n            model_name=\"spaceblock\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),\n        ),\n        migrations.AddField(\n            model_name=\"transformblock\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),\n        ),\n        migrations.AddField(\n            model_name=\"ulabelblock\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),\n        ),\n        migrations.AlterField(\n            model_name=\"branch\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),\n        ),\n        migrations.AlterField(\n            model_name=\"project\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),\n        ),\n        migrations.AlterField(\n            model_name=\"run\",\n            name=\"_status_code\",\n            field=models.SmallIntegerField(db_default=-3, db_index=True, default=-3),\n        ),\n    ]\n"
  },
  {
    "path": "lamindb/migrations/0183_v2_2_part_6.py",
    "content": "# Generated by Django 5.2 on 2026-02-17 23:04\n\nfrom django.db import migrations\n\n\nclass Migration(migrations.Migration):\n    dependencies = [\n        (\"lamindb\", \"0182_v2_2_part_5\"),\n    ]\n\n    operations = [\n        migrations.RemoveField(\n            model_name=\"branch\",\n            name=\"artifacts\",\n        ),\n        migrations.DeleteModel(\n            name=\"BranchArtifact\",\n        ),\n    ]\n"
  },
  {
    "path": "lamindb/migrations/0184_alter_transformrecord_feature.py",
    "content": "# Generated by Django 5.2 on 2026-03-07 12:16\n\nimport django.db.models.deletion\nfrom django.db import migrations\n\nimport lamindb.base.fields\n\n\nclass Migration(migrations.Migration):\n    dependencies = [\n        (\"lamindb\", \"0183_v2_2_part_6\"),\n    ]\n\n    operations = [\n        migrations.AlterField(\n            model_name=\"transformrecord\",\n            name=\"feature\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_transformrecord\",\n                to=\"lamindb.feature\",\n            ),\n        ),\n    ]\n"
  },
  {
    "path": "lamindb/migrations/0185_alter_runrecord_feature.py",
    "content": "# Generated by Django 5.2 on 2026-04-05 14:32\n\nimport django.db.models.deletion\nfrom django.db import migrations\n\nimport lamindb.base.fields\n\n\nclass Migration(migrations.Migration):\n    dependencies = [\n        (\"lamindb\", \"0184_alter_transformrecord_feature\"),\n    ]\n\n    operations = [\n        migrations.AlterField(\n            model_name=\"runrecord\",\n            name=\"feature\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_runrecord\",\n                to=\"lamindb.feature\",\n            ),\n        ),\n    ]\n"
  },
  {
    "path": "lamindb/migrations/0186_v2_4.py",
    "content": "# Generated by Django 5.2 on 2026-04-12 18:49\n\nimport django.db.models.deletion\nfrom django.db import migrations, models\n\nimport lamindb.base.fields\n\n\nclass Migration(migrations.Migration):\n    dependencies = [\n        (\"lamindb\", \"0185_alter_runrecord_feature\"),\n    ]\n\n    operations = [\n        migrations.AddField(\n            model_name=\"artifactblock\",\n            name=\"branch\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactblock\",\n            name=\"created_on\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"branchblock\",\n            name=\"created_on\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionblock\",\n            name=\"branch\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionblock\",\n            name=\"created_on\",\n            field=models.ForeignKey(\n                db_default=1,\n                
default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"featureblock\",\n            name=\"branch\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"featureblock\",\n            name=\"created_on\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"projectblock\",\n            name=\"branch\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"projectblock\",\n            name=\"created_on\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"recordblock\",\n            name=\"branch\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            
),\n        ),\n        migrations.AddField(\n            model_name=\"recordblock\",\n            name=\"created_on\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"runblock\",\n            name=\"branch\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"runblock\",\n            name=\"created_on\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"schemablock\",\n            name=\"branch\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"schemablock\",\n            name=\"created_on\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"spaceblock\",\n            name=\"branch\",\n            field=models.ForeignKey(\n               
 db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"spaceblock\",\n            name=\"created_on\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"transformblock\",\n            name=\"branch\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"transformblock\",\n            name=\"created_on\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"ulabelblock\",\n            name=\"branch\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"ulabelblock\",\n            name=\"created_on\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n               
 to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"block\",\n            name=\"branch\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"block\",\n            name=\"created_on\",\n            field=models.ForeignKey(\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"branch\",\n            name=\"_aux\",\n            field=lamindb.base.fields.JSONField(\n                blank=True, db_default=None, default=None, null=True\n            ),\n        ),\n    ]\n"
  },
  {
    "path": "lamindb/migrations/0187_squashed.py",
    "content": "# Generated by Django 5.2 on 2026-04-16 06:44\n\nimport django.core.validators\nimport django.db.models.deletion\nimport django.db.models.functions.datetime\nimport django.db.models.functions.text\nimport pgtrigger.compiler\nimport pgtrigger.migrations\nfrom django.db import connection, migrations, models\n\nimport lamindb.base.fields\nimport lamindb.base.uids\nimport lamindb.base.users\nimport lamindb.models.can_curate\nimport lamindb.models.has_parents\nimport lamindb.models.run\nimport lamindb.models.sqlrecord\n\nCREATE_IS_VALID_RECORD_TYPE_FUNCTION = \"\"\"\nCREATE OR REPLACE FUNCTION is_valid_record_type(record_type_id INTEGER, record_is_type BOOLEAN)\nRETURNS BOOLEAN AS $$\nBEGIN\n    -- Record with no type is valid\n    IF record_type_id IS NULL THEN\n        RETURN TRUE;\n    END IF;\n\n    -- If current record is a type, it can only reference schema-less types\n    IF record_is_type THEN\n        RETURN EXISTS (\n            SELECT 1 FROM lamindb_record r\n            WHERE r.id = record_type_id AND r.is_type AND r.schema_id IS NULL\n        );\n    END IF;\n\n    -- Regular records can reference any type\n    RETURN EXISTS (\n        SELECT 1 FROM lamindb_record r\n        WHERE r.id = record_type_id AND r.is_type\n    );\nEND;\n$$ LANGUAGE plpgsql;\n\"\"\"\n\nCREATE_IS_VALID_RECORD_TYPE_CONSTRAINT = \"\"\"\nALTER TABLE lamindb_record\nADD CONSTRAINT record_type_is_valid_fk\nCHECK (is_valid_record_type(type_id, is_type));\n\"\"\"\n\n\nCREATE_IS_VALID_FEATURE_TYPE_FUNCTION = \"\"\"\nCREATE OR REPLACE FUNCTION is_valid_feature_type(feature_type_id INTEGER)\nRETURNS BOOLEAN AS $$\nBEGIN\n    -- Feature with no type is valid\n    IF feature_type_id IS NULL THEN\n        RETURN TRUE;\n    END IF;\n\n    -- Type must have is_type = TRUE\n    RETURN EXISTS (\n        SELECT 1 FROM lamindb_feature f\n        WHERE f.id = feature_type_id AND f.is_type\n    );\nEND;\n$$ LANGUAGE plpgsql;\n\"\"\"\n\nCREATE_IS_VALID_FEATURE_TYPE_CONSTRAINT = 
\"\"\"\nALTER TABLE lamindb_feature\nADD CONSTRAINT feature_type_is_valid_fk\nCHECK (is_valid_feature_type(type_id));\n\"\"\"\n\n\nCREATE_IS_VALID_SCHEMA_TYPE_FUNCTION = \"\"\"\nCREATE OR REPLACE FUNCTION is_valid_schema_type(schema_type_id INTEGER)\nRETURNS BOOLEAN AS $$\nBEGIN\n    IF schema_type_id IS NULL THEN\n        RETURN TRUE;\n    END IF;\n\n    RETURN EXISTS (\n        SELECT 1 FROM lamindb_schema s\n        WHERE s.id = schema_type_id AND s.is_type\n    );\nEND;\n$$ LANGUAGE plpgsql;\n\"\"\"\n\nCREATE_IS_VALID_SCHEMA_TYPE_CONSTRAINT = \"\"\"\nALTER TABLE lamindb_schema\nADD CONSTRAINT schema_type_is_valid_fk\nCHECK (is_valid_schema_type(type_id));\n\"\"\"\n\n\nCREATE_IS_VALID_PROJECT_TYPE_FUNCTION = \"\"\"\nCREATE OR REPLACE FUNCTION is_valid_project_type(project_type_id INTEGER)\nRETURNS BOOLEAN AS $$\nBEGIN\n    IF project_type_id IS NULL THEN\n        RETURN TRUE;\n    END IF;\n\n    RETURN EXISTS (\n        SELECT 1 FROM lamindb_project p\n        WHERE p.id = project_type_id AND p.is_type\n    );\nEND;\n$$ LANGUAGE plpgsql;\n\"\"\"\n\nCREATE_IS_VALID_PROJECT_TYPE_CONSTRAINT = \"\"\"\nALTER TABLE lamindb_project\nADD CONSTRAINT project_type_is_valid_fk\nCHECK (is_valid_project_type(type_id));\n\"\"\"\n\n\nCREATE_IS_VALID_REFERENCE_TYPE_FUNCTION = \"\"\"\nCREATE OR REPLACE FUNCTION is_valid_reference_type(reference_type_id INTEGER)\nRETURNS BOOLEAN AS $$\nBEGIN\n    IF reference_type_id IS NULL THEN\n        RETURN TRUE;\n    END IF;\n\n    RETURN EXISTS (\n        SELECT 1 FROM lamindb_reference r\n        WHERE r.id = reference_type_id AND r.is_type\n    );\nEND;\n$$ LANGUAGE plpgsql;\n\"\"\"\n\nCREATE_IS_VALID_REFERENCE_TYPE_CONSTRAINT = \"\"\"\nALTER TABLE lamindb_reference\nADD CONSTRAINT reference_type_is_valid_fk\nCHECK (is_valid_reference_type(type_id));\n\"\"\"\n\n\nCREATE_IS_VALID_ULABEL_TYPE_FUNCTION = \"\"\"\nCREATE OR REPLACE FUNCTION is_valid_ulabel_type(ulabel_type_id INTEGER)\nRETURNS BOOLEAN AS $$\nBEGIN\n    IF ulabel_type_id IS 
NULL THEN\n        RETURN TRUE;\n    END IF;\n\n    RETURN EXISTS (\n        SELECT 1 FROM lamindb_ulabel u\n        WHERE u.id = ulabel_type_id AND u.is_type\n    );\nEND;\n$$ LANGUAGE plpgsql;\n\"\"\"\n\nCREATE_IS_VALID_ULABEL_TYPE_CONSTRAINT = \"\"\"\nALTER TABLE lamindb_ulabel\nADD CONSTRAINT ulabel_type_is_valid_fk\nCHECK (is_valid_ulabel_type(type_id));\n\"\"\"\n\n\ndef apply_constraints(apps, schema_editor):\n    if schema_editor.connection.vendor == \"postgresql\":\n        schema_editor.execute(CREATE_IS_VALID_RECORD_TYPE_FUNCTION)\n        schema_editor.execute(CREATE_IS_VALID_RECORD_TYPE_CONSTRAINT)\n        schema_editor.execute(CREATE_IS_VALID_FEATURE_TYPE_FUNCTION)\n        schema_editor.execute(CREATE_IS_VALID_FEATURE_TYPE_CONSTRAINT)\n        schema_editor.execute(CREATE_IS_VALID_SCHEMA_TYPE_FUNCTION)\n        schema_editor.execute(CREATE_IS_VALID_SCHEMA_TYPE_CONSTRAINT)\n        schema_editor.execute(CREATE_IS_VALID_PROJECT_TYPE_FUNCTION)\n        schema_editor.execute(CREATE_IS_VALID_PROJECT_TYPE_CONSTRAINT)\n        schema_editor.execute(CREATE_IS_VALID_REFERENCE_TYPE_FUNCTION)\n        schema_editor.execute(CREATE_IS_VALID_REFERENCE_TYPE_CONSTRAINT)\n        schema_editor.execute(CREATE_IS_VALID_ULABEL_TYPE_FUNCTION)\n        schema_editor.execute(CREATE_IS_VALID_ULABEL_TYPE_CONSTRAINT)\n\n\nclass Migration(migrations.Migration):\n    replaces = [\n        (\"lamindb\", \"0177_squashed\"),\n        (\"lamindb\", \"0177_alter_artifactblock_artifact_and_more\"),\n        (\"lamindb\", \"0178_v2_2\"),\n        (\"lamindb\", \"0179_v2_2_part_2\"),\n        (\"lamindb\", \"0180_v2_2_part_3\"),\n        (\"lamindb\", \"0181_v2_2_part_4\"),\n        (\"lamindb\", \"0182_v2_2_part_5\"),\n        (\"lamindb\", \"0183_v2_2_part_6\"),\n        (\"lamindb\", \"0184_alter_transformrecord_feature\"),\n        (\"lamindb\", \"0185_alter_runrecord_feature\"),\n        (\"lamindb\", \"0186_v2_4\"),\n        (\"lamindb\", \"0187_v2_4_part_2\"),\n    ]\n\n    
dependencies = []  # type: ignore\n\n    operations = [\n        migrations.CreateModel(\n            name=\"Migration\",\n            fields=[\n                (\n                    \"id\",\n                    models.BigAutoField(\n                        auto_created=True,\n                        primary_key=True,\n                        serialize=False,\n                        verbose_name=\"ID\",\n                    ),\n                ),\n                (\n                    \"app\",\n                    lamindb.base.fields.CharField(\n                        blank=True, default=None, max_length=255\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True, default=None, max_length=255\n                    ),\n                ),\n                (\"applied\", lamindb.base.fields.DateTimeField(blank=True)),\n            ],\n            options={\n                \"db_table\": \"django_migrations\",\n                \"managed\": False,\n            },\n        ),\n        migrations.CreateModel(\n            name=\"Branch\",\n            fields=[\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\"name\", models.CharField(db_index=True, max_length=100)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_12,\n                        editable=False,\n                        max_length=12,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"created_at\",\n                  
  lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=0, db_index=True, default=0),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n            ],\n        ),\n        migrations.CreateModel(\n            name=\"Space\",\n            fields=[\n                (\"id\", models.SmallAutoField(primary_key=True, serialize=False)),\n                (\"name\", models.CharField(db_index=True, max_length=100)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_12,\n                        editable=False,\n                        max_length=12,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n            ],\n        ),\n        migrations.CreateModel(\n            name=\"Artifact\",\n            fields=[\n                (\n 
                   \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n      
                  editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"key\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=1024,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"_real_key\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=1024,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(\n                        blank=True, db_index=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"suffix\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=30,\n                    ),\n                ),\n                (\n                    \"kind\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=20,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"otype\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        
editable=False,\n                        max_length=64,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"size\",\n                    lamindb.base.fields.BigIntegerField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"hash\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=22,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"n_files\",\n                    lamindb.base.fields.BigIntegerField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"n_observations\",\n                    lamindb.base.fields.BigIntegerField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"_hash_type\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    
\"_key_is_virtual\",\n                    lamindb.base.fields.BooleanField(blank=True, default=None),\n                ),\n                (\n                    \"_overwrite_versions\",\n                    lamindb.base.fields.BooleanField(blank=True, default=None),\n                ),\n                (\n                    \"_actions\",\n                    models.ManyToManyField(\n                        related_name=\"_action_targets\", to=\"lamindb.artifact\"\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactArtifact\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_artifact\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"value\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_value\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        
migrations.AddField(\n            model_name=\"artifact\",\n            name=\"artifacts\",\n            field=models.ManyToManyField(\n                related_name=\"linked_by_artifacts\",\n                through=\"lamindb.ArtifactArtifact\",\n                to=\"lamindb.artifact\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Block\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"readme\",\n                        db_index=True,\n                        default=\"readme\",\n               
         max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=0, db_index=True, default=0),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\"key\", models.CharField(db_index=True, max_length=1024, null=True)),\n                (\n                    \"anchor\",\n                    models.ForeignKey(\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"children\",\n                        to=\"lamindb.block\",\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"space\",\n                    
lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.space\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.CreateModel(\n            name=\"BlockProject\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"block\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.block\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                
default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Collection\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n               
 (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_20,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"key\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=255\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"hash\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=22,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"reference\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"reference_type\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=25,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"_actions\",\n                    
models.ManyToManyField(related_name=\"+\", to=\"lamindb.artifact\"),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"meta_artifact\",\n                    lamindb.base.fields.OneToOneField(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"_meta_of_collection\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n        ),\n        migrations.CreateModel(\n            name=\"CollectionArtifact\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n     
           (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_collection\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"collection\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_artifact\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"artifacts\",\n            field=models.ManyToManyField(\n                related_name=\"collections\",\n                through=\"lamindb.CollectionArtifact\",\n                to=\"lamindb.artifact\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"CollectionProject\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"collection\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        
to=\"lamindb.collection\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"CollectionReference\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"collection\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_reference\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"Feature\",\n            fields=[\n                (\n                    \"is_type\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, db_index=True, default=False\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n      
              lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_12,\n                        editable=False,\n                        max_length=12,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=150\n                    ),\n                ),\n                (\n                    \"_dtype_str\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"unit\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n       
             ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\"array_rank\", models.SmallIntegerField(db_index=True, default=0)),\n                (\"array_size\", models.IntegerField(db_index=True, default=0)),\n                (\n                    \"array_shape\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"synonyms\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"default_value\",\n                    lamindb.base.fields.JSONField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"nullable\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"coerce\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        
default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"type\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"features\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n            bases=(lamindb.models.can_curate.CanCurate, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"CollectionRecord\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"collection\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_record\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        
related_name=\"links_collectionrecord\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactRun\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_run\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_artifactrun\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactReference\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                    
    db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_reference\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_artifactreference\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactRecord\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_record\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n        
        (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_artifactrecord\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactProject\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_artifactproject\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            
model_name=\"artifactartifact\",\n            name=\"feature\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=None,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_artifactartifact\",\n                to=\"lamindb.feature\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"FeatureProject\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"JsonValue\",\n            fields=[\n                (\n                    \"id\",\n                    models.BigAutoField(\n                        auto_created=True,\n                        primary_key=True,\n                        serialize=False,\n                        verbose_name=\"ID\",\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                
),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"value\", models.JSONField()),\n                (\n                    \"hash\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=22,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        
default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n                \"base_manager_name\": \"objects\",\n            },\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactJsonValue\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_jsonvalue\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"jsonvalue\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_artifact\",\n                        to=\"lamindb.jsonvalue\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"json_values\",\n            field=models.ManyToManyField(\n                related_name=\"artifacts\",\n                
through=\"lamindb.ArtifactJsonValue\",\n                to=\"lamindb.jsonvalue\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Project\",\n            fields=[\n                (\n                    \"is_type\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, db_index=True, default=False\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_12,\n                        editable=False,\n                        max_length=12,\n             
           unique=True,\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=255\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"abbr\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=32,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"url\",\n                    lamindb.base.fields.URLField(\n                        blank=True, default=None, max_length=255, null=True\n                    ),\n                ),\n                (\n                    \"start_date\",\n                    lamindb.base.fields.DateField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"end_date\",\n                    lamindb.base.fields.DateField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=0, db_index=True, default=0),\n                ),\n                (\n                    \"artifacts\",\n                    models.ManyToManyField(\n                        related_name=\"projects\",\n                        through=\"lamindb.ArtifactProject\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"blocks\",\n                    models.ManyToManyField(\n                        related_name=\"projects\",\n                        
through=\"lamindb.BlockProject\",\n                        to=\"lamindb.block\",\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"collections\",\n                    models.ManyToManyField(\n                        related_name=\"projects\",\n                        through=\"lamindb.CollectionProject\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"features\",\n                    models.ManyToManyField(\n                        related_name=\"projects\",\n                        through=\"lamindb.FeatureProject\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"parents\",\n                    models.ManyToManyField(\n                        related_name=\"children\", to=\"lamindb.project\"\n                    ),\n                ),\n                (\n                    \"predecessors\",\n                    models.ManyToManyField(\n                        related_name=\"successors\", to=\"lamindb.project\"\n      
              ),\n                ),\n                (\n                    \"type\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"projects\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n            bases=(\n                lamindb.models.can_curate.CanCurate,\n                models.Model,\n                lamindb.models.sqlrecord.ValidateFields,\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"featureproject\",\n            name=\"project\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_feature\",\n                to=\"lamindb.project\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionproject\",\n            name=\"project\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_collection\",\n                to=\"lamindb.project\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"BranchProject\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n              
  (\n                    \"project\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_branch\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"branch\", \"project\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"branch\",\n            name=\"projects\",\n            field=models.ManyToManyField(\n                related_name=\"branches\",\n                through=\"lamindb.BranchProject\",\n                to=\"lamindb.project\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"blockproject\",\n            name=\"project\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_block\",\n                to=\"lamindb.project\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactproject\",\n            name=\"project\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_artifact\",\n                to=\"lamindb.project\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Record\",\n            fields=[\n                (\n                    \"is_type\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, db_index=True, default=False\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    
lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=16,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=150,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"description\",\n                    
lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"reference\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"reference_type\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=25,\n                        null=True,\n                    ),\n                ),\n                (\"extra_data\", models.JSONField(null=True)),\n                (\n                    \"artifacts\",\n                    models.ManyToManyField(\n                        related_name=\"records\",\n                        through=\"lamindb.ArtifactRecord\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"collections\",\n                    models.ManyToManyField(\n                        related_name=\"records\",\n                        through=\"lamindb.CollectionRecord\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    lamindb.base.fields.ForeignKey(\n    
                    blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"parents\",\n                    models.ManyToManyField(\n                        related_name=\"children\", to=\"lamindb.record\"\n                    ),\n                ),\n                (\n                    \"type\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"records\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n            bases=(\n                lamindb.models.has_parents.HasParents,\n                lamindb.models.can_curate.CanCurate,\n                models.Model,\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ProjectRecord\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        
on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_projectrecord\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"project\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_record\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"records\",\n            field=models.ManyToManyField(\n                related_name=\"projects\",\n                through=\"lamindb.ProjectRecord\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionrecord\",\n            name=\"record\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_collection\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactrecord\",\n            name=\"record\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_artifact\",\n          
      to=\"lamindb.record\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RecordArtifact\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordartifact\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_artifact\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"value\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_in_record\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"record\", \"feature\", \"value\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"linked_in_records\",\n            field=models.ManyToManyField(\n                related_name=\"linked_artifacts\",\n                through=\"lamindb.RecordArtifact\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RecordCollection\",\n            fields=[\n                (\"id\", 
models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordcollection\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_collection\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"value\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_in_record\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"record\", \"feature\", \"value\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"linked_in_records\",\n            field=models.ManyToManyField(\n                related_name=\"linked_collections\",\n                through=\"lamindb.RecordCollection\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RecordProject\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n              
          blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordproject\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_project\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"value\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_in_record\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"record\", \"feature\", \"value\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"linked_in_records\",\n            field=models.ManyToManyField(\n                related_name=\"linked_projects\",\n                through=\"lamindb.RecordProject\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RecordRecord\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordrecord\",\n                        
to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_record\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"value\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_record\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"record\", \"feature\", \"value\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"linked_records\",\n            field=models.ManyToManyField(\n                related_name=\"linked_in_records\",\n                through=\"lamindb.RecordRecord\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RecordReference\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordreference\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        
blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_reference\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.CreateModel(\n            name=\"RecordRun\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordrun\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_run\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.CreateModel(\n            name=\"RecordTransform\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordtransform\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        
blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_transform\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.CreateModel(\n            name=\"RecordULabel\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordulabel\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_ulabel\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.CreateModel(\n            name=\"RecordUser\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recorduser\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        
blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_user\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.CreateModel(\n            name=\"Reference\",\n            fields=[\n                (\n                    \"is_type\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, db_index=True, default=False\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                
        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_12,\n                        editable=False,\n                        max_length=12,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=255\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"abbr\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=32,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"url\",\n                    lamindb.base.fields.URLField(blank=True, db_index=True, null=True),\n                ),\n                (\n                    \"pubmed_id\",\n                    lamindb.base.fields.BigIntegerField(\n                        blank=True, db_index=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"doi\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                        validators=[\n                            django.core.validators.RegexValidator(\n                                message=\"Must be a DOI (e.g., 10.1000/xyz123 or https://doi.org/10.1000/xyz123)\",\n                                
regex=\"^(?:https?://(?:dx\\\\.)?doi\\\\.org/|doi:|DOI:)?10\\\\.\\\\d+/.*$\",\n                            )\n                        ],\n                    ),\n                ),\n                (\n                    \"text\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"date\",\n                    lamindb.base.fields.DateField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"artifacts\",\n                    models.ManyToManyField(\n                        related_name=\"references\",\n                        through=\"lamindb.ArtifactReference\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"collections\",\n                    models.ManyToManyField(\n                        related_name=\"references\",\n                        through=\"lamindb.CollectionReference\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                
(\n                    \"linked_in_records\",\n                    models.ManyToManyField(\n                        related_name=\"linked_references\",\n                        through=\"lamindb.RecordReference\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"type\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"references\",\n                        to=\"lamindb.reference\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n            bases=(\n                lamindb.models.can_curate.CanCurate,\n                models.Model,\n                lamindb.models.sqlrecord.ValidateFields,\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"recordreference\",\n            name=\"value\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_in_record\",\n                to=\"lamindb.reference\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"references\",\n            field=models.ManyToManyField(\n                related_name=\"projects\", to=\"lamindb.reference\"\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionreference\",\n            name=\"reference\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_collection\",\n                to=\"lamindb.reference\",\n            ),\n        ),\n        
migrations.AddField(\n            model_name=\"artifactreference\",\n            name=\"reference\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_artifact\",\n                to=\"lamindb.reference\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ReferenceRecord\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_referencerecord\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_reference\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"reference\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_record\",\n                        
to=\"lamindb.reference\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"reference\",\n            name=\"records\",\n            field=models.ManyToManyField(\n                related_name=\"references\",\n                through=\"lamindb.ReferenceRecord\",\n                to=\"lamindb.record\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Run\",\n            fields=[\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n               
         db_index=True,\n                        default=None,\n                        max_length=150,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"entrypoint\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"started_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"finished_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True, db_index=True, default=None, null=True\n                    ),\n                ),\n                (\"params\", models.JSONField(null=True)),\n                (\n                    \"reference\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"reference_type\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=25,\n                        null=True,\n          
          ),\n                ),\n                (\n                    \"cli_args\",\n                    lamindb.base.fields.CharField(\n                        blank=True, default=None, max_length=1024, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"_is_consecutive\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=-3, db_index=True, default=-3),\n                ),\n                (\n                    \"artifacts\",\n                    models.ManyToManyField(\n                        related_name=\"runs\",\n                        through=\"lamindb.ArtifactRun\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        
on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"environment\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"_environment_of\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"initiated_by_run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"initiated_runs\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"linked_in_records\",\n                    models.ManyToManyField(\n                        related_name=\"linked_runs\",\n                        through=\"lamindb.RecordRun\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"plan\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"_plan_for_runs\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"report\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        
default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"_report_of\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"referencerecord\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"reference\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"recordrun\",\n            name=\"value\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_in_record\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"input_of_runs\",\n            field=models.ManyToManyField(\n                related_name=\"input_records\", to=\"lamindb.run\"\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                
editable=False,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"output_records\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"projectrecord\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"jsonvalue\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"featureproject\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"feature\",\n    
        name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionreference\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionrecord\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionproject\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionartifact\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                
related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"input_of_runs\",\n            field=models.ManyToManyField(\n                related_name=\"input_collections\", to=\"lamindb.run\"\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"recreating_runs\",\n            field=models.ManyToManyField(\n                related_name=\"recreated_collections\", to=\"lamindb.run\"\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=None,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"output_collections\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"blockproject\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactUser\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                 
   \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_user\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_artifactuser\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactULabel\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                  
      on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_ulabel\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_artifactulabel\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"artifactrun\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.CASCADE,\n                related_name=\"links_artifact\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactreference\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        
migrations.AddField(\n            model_name=\"artifactrecord\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactproject\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactjsonvalue\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactartifact\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.models.run.current_run,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"input_of_runs\",\n            field=models.ManyToManyField(\n                related_name=\"input_artifacts\", to=\"lamindb.run\"\n            ),\n        ),\n        migrations.AddField(\n      
      model_name=\"artifact\",\n            name=\"recreating_runs\",\n            field=models.ManyToManyField(\n                related_name=\"recreated_artifacts\", to=\"lamindb.run\"\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"run\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=None,\n                editable=False,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"output_artifacts\",\n                to=\"lamindb.run\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RunArtifact\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_in_run\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_runartifact\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_artifact\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n  
          ],\n            options={\n                \"unique_together\": {(\"run\", \"artifact\", \"feature\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"linked_artifacts\",\n            field=models.ManyToManyField(\n                related_name=\"linked_by_runs\",\n                through=\"lamindb.RunArtifact\",\n                to=\"lamindb.artifact\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RunJsonValue\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"jsonvalue\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_run\",\n                        to=\"lamindb.jsonvalue\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_jsonvalue\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"json_values\",\n            
field=models.ManyToManyField(\n                related_name=\"runs\",\n                through=\"lamindb.RunJsonValue\",\n                to=\"lamindb.jsonvalue\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RunProject\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"project\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_run\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"runs\",\n            field=models.ManyToManyField(\n                related_name=\"projects\", through=\"lamindb.RunProject\", to=\"lamindb.run\"\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RunRecord\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    
\"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_runrecord\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_run\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_record\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"runs\",\n            field=models.ManyToManyField(\n                related_name=\"records\", through=\"lamindb.RunRecord\", to=\"lamindb.run\"\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Schema\",\n            fields=[\n                (\n                    \"is_type\",\n                    lamindb.base.fields.BooleanField(\n                        
blank=True, db_default=False, db_index=True, default=False\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=16,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=150,\n                        
null=True,\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"n_members\",\n                    lamindb.base.fields.IntegerField(\n                        blank=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"coerce\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"flexible\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"itype\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=120,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"otype\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=64,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"_dtype_str\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        default=None,\n                        editable=False,\n                        max_length=64,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"hash\",\n                    
lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=22,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"minimal_set\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True, editable=False\n                    ),\n                ),\n                (\n                    \"ordered_set\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=False, editable=False\n                    ),\n                ),\n                (\n                    \"maximal_set\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=False, editable=False\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        
blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"type\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"schemas\",\n                        to=\"lamindb.schema\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n            bases=(lamindb.models.can_curate.CanCurate, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"schema\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                null=True,\n                on_delete=django.db.models.deletion.CASCADE,\n                related_name=\"records\",\n                to=\"lamindb.schema\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactSchema\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"slot\",\n                    lamindb.base.fields.CharField(\n                        blank=True, default=None, max_length=255, null=True\n                    ),\n       
         ),\n                (\n                    \"feature_ref_is_semantic\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"artifact\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"_links_schema\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"schema\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"_links_artifact\",\n                        to=\"lamindb.schema\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"schema\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=None,\n                null=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"validated_artifacts\",\n                to=\"lamindb.schema\",\n            ),\n        ),\n        migrations.AddField(\n            
model_name=\"artifact\",\n            name=\"schemas\",\n            field=models.ManyToManyField(\n                related_name=\"artifacts\",\n                through=\"lamindb.ArtifactSchema\",\n                to=\"lamindb.schema\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"SchemaComponent\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"slot\",\n                    lamindb.base.fields.CharField(\n                        blank=True, default=None, max_length=255, null=True\n                    ),\n                ),\n                (\n                    \"component\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_composite\",\n                        to=\"lamindb.schema\",\n                    ),\n                ),\n                (\n                    \"composite\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_component\",\n                        to=\"lamindb.schema\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n         
               on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"schema\",\n            name=\"components\",\n            field=models.ManyToManyField(\n                related_name=\"composites\",\n                through=\"lamindb.SchemaComponent\",\n                to=\"lamindb.schema\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"SchemaFeature\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_schema\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"schema\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_feature\",\n                        to=\"lamindb.schema\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"schema\", \"feature\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"feature\",\n            name=\"schemas\",\n            field=models.ManyToManyField(\n                related_name=\"features\",\n                through=\"lamindb.SchemaFeature\",\n                to=\"lamindb.schema\",\n            ),\n     
   ),\n        migrations.CreateModel(\n            name=\"SchemaProject\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"project\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_schema\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"schema\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.schema\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"schemas\",\n            field=models.ManyToManyField(\n                related_name=\"projects\",\n                
through=\"lamindb.SchemaProject\",\n                to=\"lamindb.schema\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"schema\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"reference\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                
on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"jsonvalue\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"feature\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"branch\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"space\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n      
          on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.space\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Storage\",\n            fields=[\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_12,\n                        editable=False,\n                        max_length=12,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"root\",\n                    lamindb.base.fields.CharField(\n                
        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"type\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=30\n                    ),\n                ),\n                (\n                    \"region\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=64,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"instance_uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=12,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        
on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"space\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.space\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"storage\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"artifacts\",\n                to=\"lamindb.storage\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"Transform\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n      
              \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=16,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"key\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=1024\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(\n                        blank=True, db_index=True, default=None, null=True\n                    ),\n                ),\n                (\n                    \"kind\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=\"pipeline\", max_length=20\n                    ),\n                ),\n                (\n                    \"source_code\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    
\"hash\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=22,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"reference\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=255,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"reference_type\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=25,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                
        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"environment\",\n                    models.ForeignKey(\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"_environment_of_transforms\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"linked_in_records\",\n                    models.ManyToManyField(\n                        related_name=\"linked_transforms\",\n                        through=\"lamindb.RecordTransform\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"plan\",\n                    models.ForeignKey(\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"_plan_for_transforms\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"space\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        
to=\"lamindb.space\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"transform\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.CASCADE,\n                related_name=\"runs\",\n                to=\"lamindb.transform\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"recordtransform\",\n            name=\"value\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_in_record\",\n                to=\"lamindb.transform\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"TransformProject\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"project\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_transform\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n           
             null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"transform\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.transform\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"transforms\",\n            field=models.ManyToManyField(\n                related_name=\"projects\",\n                through=\"lamindb.TransformProject\",\n                to=\"lamindb.transform\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"TransformRecord\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_transformrecord\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    
lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_transform\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"transform\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_record\",\n                        to=\"lamindb.transform\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"transforms\",\n            field=models.ManyToManyField(\n                related_name=\"records\",\n                through=\"lamindb.TransformRecord\",\n                to=\"lamindb.transform\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"TransformReference\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", 
models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"reference\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_transform\",\n                        to=\"lamindb.reference\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"transform\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_reference\",\n                        to=\"lamindb.transform\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"reference\",\n            name=\"transforms\",\n            field=models.ManyToManyField(\n                related_name=\"references\",\n                through=\"lamindb.TransformReference\",\n                to=\"lamindb.transform\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"TransformTransform\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\"config\", models.JSONField(default=None, null=True)),\n                (\n                    \"created_at\",\n                    
lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"predecessor\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_successor\",\n                        to=\"lamindb.transform\",\n                    ),\n                ),\n                (\n                    \"successor\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_predecessor\",\n                        to=\"lamindb.transform\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"transform\",\n            name=\"predecessors\",\n            field=models.ManyToManyField(\n                related_name=\"successors\",\n                through=\"lamindb.TransformTransform\",\n                to=\"lamindb.transform\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ULabel\",\n            fields=[\n                (\n                    \"is_type\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, db_index=True, default=False\n                    ),\n                ),\n                (\n                    \"is_locked\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_default=False, default=False\n                    ),\n                ),\n                (\n                    \"_aux\",\n         
           lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None, null=True\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=lamindb.base.uids.base62_8,\n                        editable=False,\n                        max_length=8,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=150\n                    ),\n                ),\n                (\n                    \"description\",\n                    lamindb.base.fields.TextField(blank=True, default=None, null=True),\n                ),\n                (\n                    \"reference\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        
max_length=255,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"reference_type\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=25,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"artifacts\",\n                    models.ManyToManyField(\n                        related_name=\"ulabels\",\n                        through=\"lamindb.ArtifactULabel\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"linked_in_records\",\n                    models.ManyToManyField(\n                        related_name=\"linked_ulabels\",\n                        through=\"lamindb.RecordULabel\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"parents\",\n                    
models.ManyToManyField(\n                        related_name=\"children\", to=\"lamindb.ulabel\"\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"space\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.space\",\n                    ),\n                ),\n                (\n                    \"type\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"ulabels\",\n                        to=\"lamindb.ulabel\",\n                    ),\n                ),\n            ],\n            options={\n                \"abstract\": False,\n            },\n            bases=(\n                lamindb.models.has_parents.HasParents,\n                lamindb.models.can_curate.CanCurate,\n                models.Model,\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"TransformULabel\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        
db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"transform\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_ulabel\",\n                        to=\"lamindb.transform\",\n                    ),\n                ),\n                (\n                    \"ulabel\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_transform\",\n                        to=\"lamindb.ulabel\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"transform\",\n            name=\"ulabels\",\n            field=models.ManyToManyField(\n                related_name=\"transforms\",\n                through=\"lamindb.TransformULabel\",\n                to=\"lamindb.ulabel\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RunULabel\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, 
serialize=False)),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_ulabel\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"ulabel\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_run\",\n                        to=\"lamindb.ulabel\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"ulabels\",\n            field=models.ManyToManyField(\n                related_name=\"runs\", through=\"lamindb.RunULabel\", to=\"lamindb.ulabel\"\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"recordulabel\",\n            name=\"value\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_record\",\n                to=\"lamindb.ulabel\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"CollectionULabel\",\n            fields=[\n                (\n                    \"created_at\",\n                    
lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"collection\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_ulabel\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=None,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_collectionulabel\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"ulabel\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_collection\",\n                        to=\"lamindb.ulabel\",\n                    ),\n                ),\n            ],\n 
           bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"ulabels\",\n            field=models.ManyToManyField(\n                related_name=\"collections\",\n                through=\"lamindb.CollectionULabel\",\n                to=\"lamindb.ulabel\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"BranchULabel\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_ulabel\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"ulabel\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_branch\",\n                        to=\"lamindb.ulabel\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"branch\", \"ulabel\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"branch\",\n            name=\"ulabels\",\n            field=models.ManyToManyField(\n                related_name=\"branches\",\n                through=\"lamindb.BranchULabel\",\n                to=\"lamindb.ulabel\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactulabel\",\n            name=\"ulabel\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                
on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_artifact\",\n                to=\"lamindb.ulabel\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ULabelProject\",\n            fields=[\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"project\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_ulabel\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        default=lamindb.models.run.current_run,\n                        null=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"ulabel\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_project\",\n                        to=\"lamindb.ulabel\",\n                    ),\n                ),\n            ],\n            bases=(lamindb.models.sqlrecord.IsLink, models.Model),\n        ),\n        migrations.AddField(\n            
model_name=\"project\",\n            name=\"ulabels\",\n            field=models.ManyToManyField(\n                related_name=\"projects\",\n                through=\"lamindb.ULabelProject\",\n                to=\"lamindb.ulabel\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"User\",\n            fields=[\n                (\"id\", models.AutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        editable=False,\n                        max_length=8,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"handle\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        unique=True,\n                    ),\n                ),\n                (\n                    \"name\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=150,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"updated_at\",\n                    lamindb.base.fields.DateTimeField(\n                        blank=True,\n            
            db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"artifacts\",\n                    models.ManyToManyField(\n                        related_name=\"users\",\n                        through=\"lamindb.ArtifactUser\",\n                        to=\"lamindb.artifact\",\n                        through_fields=(\"user\", \"artifact\"),\n                    ),\n                ),\n                (\n                    \"linked_in_records\",\n                    models.ManyToManyField(\n                        related_name=\"linked_users\",\n                        through=\"lamindb.RecordUser\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.can_curate.CanCurate),\n        ),\n        migrations.AddField(\n            model_name=\"ulabelproject\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ULabelBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, 
db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"readme\",\n                        db_index=True,\n                        default=\"readme\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=0, db_index=True, default=0),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"branch\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    models.ForeignKey(\n                  
      db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"ulabel\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.ulabel\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=lamindb.base.users.current_user_id,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"ulabel\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"transformulabel\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"transformtransform\",\n            name=\"created_by\",\n            
field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"transformreference\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"transformrecord\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"transformproject\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"TransformBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        
null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"readme\",\n                        db_index=True,\n                        default=\"readme\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=0, db_index=True, default=0),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\"line_number\", models.IntegerField(null=True)),\n                (\n                    \"branch\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        
on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"transform\",\n                    models.ForeignKey(\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.transform\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=lamindb.base.users.current_user_id,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"transform\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"created_transforms\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"storage\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                
editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"SpaceBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"readme\",\n                        db_index=True,\n                        default=\"readme\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    
),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=0, db_index=True, default=0),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"branch\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"space\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.space\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=lamindb.base.users.current_user_id,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"space\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=None,\n                null=True,\n                
on_delete=django.db.models.deletion.CASCADE,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"schemaproject\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"schemacomponent\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"SchemaBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        
editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"readme\",\n                        db_index=True,\n                        default=\"readme\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=0, db_index=True, default=0),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"branch\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"schema\",\n                    models.ForeignKey(\n              
          on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.schema\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=lamindb.base.users.current_user_id,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"schema\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"runulabel\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"runrecord\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"runproject\",\n            name=\"created_by\",\n            
field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"runjsonvalue\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RunBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n               
     \"kind\",\n                    models.CharField(\n                        db_default=\"readme\",\n                        db_index=True,\n                        default=\"readme\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=0, db_index=True, default=0),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"branch\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"run\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.run\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        
default=lamindb.base.users.current_user_id,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"run\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.CASCADE,\n                related_name=\"created_runs\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"referencerecord\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"reference\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"recorduser\",\n            name=\"value\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_record\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            
name=\"RecordBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"readme\",\n                        db_index=True,\n                        default=\"readme\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=0, db_index=True, default=0),\n                ),\n                (\"_aux\", 
models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"branch\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=lamindb.base.users.current_user_id,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"record\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n           
 name=\"ProjectUser\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"role\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=32\n                    ),\n                ),\n                (\n                    \"project\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_user\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n                (\n                    \"user\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_project\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AddField(\n            model_name=\"projectrecord\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ProjectBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        
null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"readme\",\n                        db_index=True,\n                        default=\"readme\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=0, db_index=True, default=0),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"branch\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n    
                    to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"project\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.project\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=lamindb.base.users.current_user_id,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"project\",\n            name=\"users\",\n            field=models.ManyToManyField(\n                related_name=\"projects\",\n                through=\"lamindb.ProjectUser\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"jsonvalue\",\n            
name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"featureproject\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"FeatureBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", 
models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"readme\",\n                        db_index=True,\n                        default=\"readme\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=0, db_index=True, default=0),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"branch\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"feature\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                
(\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=lamindb.base.users.current_user_id,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"feature\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionulabel\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionreference\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionrecord\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                
editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collectionproject\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"CollectionBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        
db_default=\"readme\",\n                        db_index=True,\n                        default=\"readme\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=0, db_index=True, default=0),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"branch\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"collection\",\n                    models.ForeignKey(\n                        null=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.collection\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        
default=lamindb.base.users.current_user_id,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"collectionartifact\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"collection\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"BranchUser\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"role\",\n                    lamindb.base.fields.CharField(\n                        blank=True, db_index=True, default=None, max_length=32\n                    ),\n                ),\n                (\n                    \"branch\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"links_user\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"user\",\n                    
lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_branch\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.CreateModel(\n            name=\"BranchBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        db_default=\"readme\",\n                        db_index=True,\n                        default=\"readme\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    
models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=0, db_index=True, default=0),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"branch\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=lamindb.base.users.current_user_id,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"branch\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"branch\",\n            name=\"users\",\n            field=models.ManyToManyField(\n                related_name=\"branches\", through=\"lamindb.BranchUser\", to=\"lamindb.user\"\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"blockproject\",\n            name=\"created_by\",\n            
field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"block\",\n            name=\"created_by\",\n            field=models.ForeignKey(\n                default=lamindb.base.users.current_user_id,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactuser\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactuser\",\n            name=\"user\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"links_artifact\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactulabel\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            
model_name=\"artifactschema\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactrun\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactreference\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactrecord\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactproject\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n    
            editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifactjsonvalue\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"ArtifactBlock\",\n            fields=[\n                (\n                    \"version_tag\",\n                    lamindb.base.fields.CharField(\n                        blank=True,\n                        db_index=True,\n                        default=None,\n                        max_length=30,\n                        null=True,\n                    ),\n                ),\n                (\n                    \"is_latest\",\n                    lamindb.base.fields.BooleanField(\n                        blank=True, db_index=True, default=True\n                    ),\n                ),\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"uid\",\n                    models.CharField(\n                        db_index=True,\n                        default=lamindb.base.uids.base62_16,\n                        editable=False,\n                        max_length=20,\n                        unique=True,\n                    ),\n                ),\n                (\"content\", models.TextField()),\n                (\"hash\", models.CharField(db_index=True, max_length=22, null=True)),\n                (\n                    \"kind\",\n                    models.CharField(\n                        
db_default=\"readme\",\n                        db_index=True,\n                        default=\"readme\",\n                        max_length=22,\n                    ),\n                ),\n                (\n                    \"created_at\",\n                    models.DateTimeField(\n                        db_default=django.db.models.functions.datetime.Now(),\n                        db_index=True,\n                        editable=False,\n                    ),\n                ),\n                (\n                    \"_status_code\",\n                    models.SmallIntegerField(db_default=0, db_index=True, default=0),\n                ),\n                (\"_aux\", models.JSONField(db_default=None, default=None, null=True)),\n                (\n                    \"artifact\",\n                    models.ForeignKey(\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"ablocks\",\n                        to=\"lamindb.artifact\",\n                    ),\n                ),\n                (\n                    \"branch\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_on\",\n                    models.ForeignKey(\n                        db_default=1,\n                        default=1,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.branch\",\n                    ),\n                ),\n                (\n                    \"created_by\",\n                    models.ForeignKey(\n                        default=lamindb.base.users.current_user_id,\n                        
on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"+\",\n                        to=\"lamindb.user\",\n                    ),\n                ),\n            ],\n        ),\n        migrations.AddField(\n            model_name=\"artifactartifact\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.AddField(\n            model_name=\"artifact\",\n            name=\"created_by\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                default=lamindb.base.users.current_user_id,\n                editable=False,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"created_artifacts\",\n                to=\"lamindb.user\",\n            ),\n        ),\n        migrations.CreateModel(\n            name=\"RecordJson\",\n            fields=[\n                (\"id\", models.BigAutoField(primary_key=True, serialize=False)),\n                (\n                    \"value\",\n                    lamindb.base.fields.JSONField(\n                        blank=True, db_default=None, default=None\n                    ),\n                ),\n                (\n                    \"feature\",\n                    lamindb.base.fields.ForeignKey(\n                        blank=True,\n                        on_delete=django.db.models.deletion.PROTECT,\n                        related_name=\"links_recordjson\",\n                        to=\"lamindb.feature\",\n                    ),\n                ),\n                (\n                    \"record\",\n                    lamindb.base.fields.ForeignKey(\n                        
blank=True,\n                        on_delete=django.db.models.deletion.CASCADE,\n                        related_name=\"values_json\",\n                        to=\"lamindb.record\",\n                    ),\n                ),\n            ],\n            options={\n                \"unique_together\": {(\"record\", \"feature\")},\n            },\n            bases=(models.Model, lamindb.models.sqlrecord.IsLink),\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"recordreference\",\n            unique_together={(\"record\", \"feature\", \"value\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"recordrun\",\n            unique_together={(\"record\", \"feature\", \"value\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"recordtransform\",\n            unique_together={(\"record\", \"feature\", \"value\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"recordulabel\",\n            unique_together={(\"record\", \"feature\", \"value\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"ulabelproject\",\n            unique_together={(\"ulabel\", \"project\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"transformulabel\",\n            unique_together={(\"transform\", \"ulabel\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"transformtransform\",\n            unique_together={(\"successor\", \"predecessor\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"transformreference\",\n            unique_together={(\"transform\", \"reference\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"transformrecord\",\n            unique_together={(\"transform\", \"record\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"transformproject\",\n            unique_together={(\"transform\", \"project\")},\n        
),\n        migrations.AlterUniqueTogether(\n            name=\"transform\",\n            unique_together={(\"key\", \"hash\")},\n        ),\n        migrations.AddConstraint(\n            model_name=\"space\",\n            constraint=models.UniqueConstraint(\n                django.db.models.functions.text.Lower(\"name\"),\n                name=\"unique_space_name_lower\",\n            ),\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"schemaproject\",\n            unique_together={(\"schema\", \"project\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"schemacomponent\",\n            unique_together={(\"composite\", \"slot\"), (\"composite\", \"slot\", \"component\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"runulabel\",\n            unique_together={(\"run\", \"ulabel\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"runrecord\",\n            unique_together={(\"run\", \"record\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"runproject\",\n            unique_together={(\"run\", \"project\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"runjsonvalue\",\n            unique_together={(\"run\", \"jsonvalue\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"referencerecord\",\n            unique_together={(\"reference\", \"feature\", \"record\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"recorduser\",\n            unique_together={(\"record\", \"feature\", \"value\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"projectuser\",\n            unique_together={(\"project\", \"user\", \"role\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"projectrecord\",\n            unique_together={(\"project\", \"feature\", \"record\")},\n        ),\n        
migrations.AlterUniqueTogether(\n            name=\"jsonvalue\",\n            unique_together={(\"feature\", \"hash\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"featureproject\",\n            unique_together={(\"feature\", \"project\")},\n        ),\n        migrations.AddConstraint(\n            model_name=\"feature\",\n            constraint=models.CheckConstraint(\n                condition=models.Q(\n                    (\"is_type\", True), (\"_dtype_str__isnull\", False), _connector=\"OR\"\n                ),\n                name=\"feature_dtype_str_not_null_when_is_type_false\",\n            ),\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"collectionulabel\",\n            unique_together={(\"collection\", \"ulabel\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"collectionreference\",\n            unique_together={(\"collection\", \"reference\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"collectionrecord\",\n            unique_together={(\"collection\", \"record\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"collectionproject\",\n            unique_together={(\"collection\", \"project\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"collectionartifact\",\n            unique_together={(\"collection\", \"artifact\")},\n        ),\n        migrations.AddConstraint(\n            model_name=\"collection\",\n            constraint=models.UniqueConstraint(\n                fields=(\"key\", \"hash\"), name=\"unique_collection_key_hash_not_null\"\n            ),\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"branchuser\",\n            unique_together={(\"branch\", \"user\", \"role\")},\n        ),\n        migrations.AddConstraint(\n            model_name=\"branch\",\n            constraint=models.UniqueConstraint(\n                
django.db.models.functions.text.Lower(\"name\"),\n                name=\"unique_branch_name_lower\",\n            ),\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"blockproject\",\n            unique_together={(\"block\", \"project\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactuser\",\n            unique_together={(\"artifact\", \"user\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactulabel\",\n            unique_together={(\"artifact\", \"ulabel\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactschema\",\n            unique_together={(\"artifact\", \"schema\"), (\"artifact\", \"slot\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactrun\",\n            unique_together={(\"artifact\", \"run\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactreference\",\n            unique_together={(\"artifact\", \"reference\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactrecord\",\n            unique_together={(\"artifact\", \"record\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactproject\",\n            unique_together={(\"artifact\", \"project\", \"feature\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactjsonvalue\",\n            unique_together={(\"artifact\", \"jsonvalue\")},\n        ),\n        migrations.AlterUniqueTogether(\n            name=\"artifactartifact\",\n            unique_together={(\"artifact\", \"value\", \"feature\")},\n        ),\n        migrations.AddConstraint(\n            model_name=\"artifact\",\n            constraint=models.UniqueConstraint(\n                condition=models.Q((\"key__isnull\", False)),\n                fields=(\"storage\", \"key\", \"hash\"),\n         
       name=\"unique_artifact_storage_key_hash_not_null\",\n            ),\n        ),\n        migrations.AddConstraint(\n            model_name=\"artifact\",\n            constraint=models.UniqueConstraint(\n                condition=models.Q((\"key__isnull\", True)),\n                fields=(\"storage\", \"hash\"),\n                name=\"unique_artifact_storage_hash_null_key\",\n            ),\n        ),\n        migrations.RunPython(apply_constraints),\n    ]\n\n\nif connection.vendor == \"postgresql\":\n    Migration.operations += [\n        pgtrigger.migrations.AddTrigger(\n            model_name=\"ulabel\",\n            trigger=pgtrigger.compiler.Trigger(\n                name=\"prevent_ulabel_type_cycle\",\n                sql=pgtrigger.compiler.UpsertTriggerSql(\n                    condition=\"WHEN (NEW.type_id IS NOT NULL)\",\n                    func=\"\\n                        -- Check for direct self-reference\\n                        IF NEW.type_id = NEW.id THEN\\n                            RAISE EXCEPTION 'Cannot set type: ulabel cannot be its own type';\\n                        END IF;\\n\\n                        -- Check for cycles in the type chain\\n                        IF EXISTS (\\n                            WITH RECURSIVE type_chain AS (\\n                                SELECT type_id, 1 as depth\\n                                FROM lamindb_ulabel\\n                                WHERE id = NEW.type_id\\n\\n                                UNION ALL\\n\\n                                SELECT r.type_id, tc.depth + 1\\n                                FROM lamindb_ulabel r\\n                                INNER JOIN type_chain tc ON r.id = tc.type_id\\n                                WHERE tc.depth < 100\\n                            )\\n                            SELECT 1 FROM type_chain WHERE type_id = NEW.id\\n                        ) THEN\\n                            RAISE EXCEPTION 'Cannot set type: would create a 
cycle';\\n                        END IF;\\n\\n                        RETURN NEW;\\n                    \",\n                    hash=\"53487a8e36a64748418457f7229de6d5cf31e6bd\",\n                    operation=\"UPDATE OR INSERT\",\n                    pgid=\"pgtrigger_prevent_ulabel_type_cycle_863ae\",\n                    table=\"lamindb_ulabel\",\n                    when=\"BEFORE\",\n                ),\n            ),\n        ),\n        pgtrigger.migrations.AddTrigger(\n            model_name=\"record\",\n            trigger=pgtrigger.compiler.Trigger(\n                name=\"prevent_record_type_cycle\",\n                sql=pgtrigger.compiler.UpsertTriggerSql(\n                    condition=\"WHEN (NEW.type_id IS NOT NULL)\",\n                    func=\"\\n                        -- Check for direct self-reference\\n                        IF NEW.type_id = NEW.id THEN\\n                            RAISE EXCEPTION 'Cannot set type: record cannot be its own type';\\n                        END IF;\\n\\n                        -- Check for cycles in the type chain\\n                        IF EXISTS (\\n                            WITH RECURSIVE type_chain AS (\\n                                SELECT type_id, 1 as depth\\n                                FROM lamindb_record\\n                                WHERE id = NEW.type_id\\n\\n                                UNION ALL\\n\\n                                SELECT r.type_id, tc.depth + 1\\n                                FROM lamindb_record r\\n                                INNER JOIN type_chain tc ON r.id = tc.type_id\\n                                WHERE tc.depth < 100\\n                            )\\n                            SELECT 1 FROM type_chain WHERE type_id = NEW.id\\n                        ) THEN\\n                            RAISE EXCEPTION 'Cannot set type: would create a cycle';\\n                        END IF;\\n\\n                        RETURN NEW;\\n                    \",\n    
                hash=\"deaab832a066dfec76228f5b7a62a08f334876a9\",\n                    operation=\"UPDATE OR INSERT\",\n                    pgid=\"pgtrigger_prevent_record_type_cycle_56c18\",\n                    table=\"lamindb_record\",\n                    when=\"BEFORE\",\n                ),\n            ),\n        ),\n        pgtrigger.migrations.AddTrigger(\n            model_name=\"feature\",\n            trigger=pgtrigger.compiler.Trigger(\n                name=\"update_feature_on_name_change\",\n                sql=pgtrigger.compiler.UpsertTriggerSql(\n                    condition=\"WHEN (OLD.name IS DISTINCT FROM NEW.name)\",\n                    func=\"DECLARE\\n    old_renamed JSONB;\\n    new_renamed JSONB;\\n    ts TEXT;\\nBEGIN\\n    -- Only proceed if name actually changed\\n    IF OLD.name IS DISTINCT FROM NEW.name THEN\\n        -- Update synonyms\\n        IF NEW.synonyms IS NULL OR NEW.synonyms = '' THEN\\n            NEW.synonyms := OLD.name;\\n        ELSIF position(OLD.name in NEW.synonyms) = 0 THEN\\n            NEW.synonyms := NEW.synonyms || '|' || OLD.name;\\n        END IF;\\n\\n        -- Update _aux with rename history\\n        ts := TO_CHAR(NOW() AT TIME ZONE 'UTC', 'YYYY-MM-DD\\\"T\\\"HH24:MI:SS\\\"Z\\\"');\\n\\n        -- Get existing renamed history or initialize empty object\\n        old_renamed := COALESCE((OLD._aux->>'renamed')::JSONB, '{}'::JSONB);\\n\\n        -- Add old name with timestamp\\n        new_renamed := old_renamed || jsonb_build_object(ts, OLD.name);\\n\\n        -- Update _aux with new renamed history\\n        IF NEW._aux IS NULL THEN\\n            NEW._aux := jsonb_build_object('renamed', new_renamed);\\n        ELSE\\n            NEW._aux := NEW._aux || jsonb_build_object('renamed', new_renamed);\\n        END IF;\\n    END IF;\\n\\n    RETURN NEW;\\nEND;\\n\",\n                    hash=\"5f2e7a65e42c34b0455f0840def52f078726e401\",\n                    operation=\"UPDATE\",\n                    
pgid=\"pgtrigger_update_feature_on_name_change_6c32d\",\n                    table=\"lamindb_feature\",\n                    when=\"BEFORE\",\n                ),\n            ),\n        ),\n    ]\n"
  },
  {
    "path": "lamindb/migrations/0187_v2_4_part_2.py",
    "content": "# Generated by Django 5.2 on 2026-04-16 06:38\n\nimport django.db.models.deletion\nfrom django.db import migrations\n\nimport lamindb.base.fields\n\n\nclass Migration(migrations.Migration):\n    dependencies = [\n        (\"lamindb\", \"0186_v2_4\"),\n    ]\n\n    operations = [\n        migrations.RemoveField(\n            model_name=\"branchblock\",\n            name=\"created_on\",\n        ),\n        migrations.AlterField(\n            model_name=\"block\",\n            name=\"branch\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n        migrations.AlterField(\n            model_name=\"block\",\n            name=\"created_on\",\n            field=lamindb.base.fields.ForeignKey(\n                blank=True,\n                db_default=1,\n                default=1,\n                on_delete=django.db.models.deletion.PROTECT,\n                related_name=\"+\",\n                to=\"lamindb.branch\",\n            ),\n        ),\n    ]\n"
  },
  {
    "path": "lamindb/migrations/README.md",
    "content": "# Attention\n\nRemember that lamindb schema changes that do not work on old databases (like adding columns or tables) cannot be deployed to cloud functions unless these instances are migrated.\n"
  },
  {
    "path": "lamindb/migrations/__init__.py",
    "content": ""
  },
  {
    "path": "lamindb/models/__init__.py",
    "content": "\"\"\"Auxiliary models & database library.\n\nRegistry basics\n---------------\n\n.. autoclass:: BaseSQLRecord\n.. autoclass:: SQLRecord\n.. autoclass:: Registry\n.. autoclass:: BasicQuerySet\n.. autoclass:: QuerySet\n\nMixins for registries\n---------------------\n\n.. autoclass:: IsVersioned\n.. autoclass:: HasType\n.. autoclass:: HasParents\n.. autoclass:: CanCurate\n.. autoclass:: TracksRun\n.. autoclass:: TracksUpdates\n\nManagers\n--------\n\n.. autoclass:: FeatureManager\n.. autoclass:: LabelManager\n.. autoclass:: QueryManager\n.. autoclass:: RelatedManager\n\nAnnotations of objects\n----------------------\n\nArtifact, run, collection, annotations can be conditioned on features.\nBesides linking categorical data, you can also link simple data types\nby virtue of the `JsonValue` model.\n\n.. autoclass:: JsonValue\n\nAnnotating artifacts.\n\n.. autoclass:: ArtifactArtifact\n.. autoclass:: ArtifactJsonValue\n.. autoclass:: ArtifactProject\n.. autoclass:: ArtifactRecord\n.. autoclass:: ArtifactReference\n.. autoclass:: ArtifactRun\n.. autoclass:: ArtifactSchema\n.. autoclass:: ArtifactULabel\n.. autoclass:: ArtifactUser\n\nAnnotating collections.\n\n.. autoclass:: CollectionArtifact\n.. autoclass:: CollectionProject\n.. autoclass:: CollectionReference\n.. autoclass:: CollectionULabel\n.. autoclass:: CollectionRecord\n\nAnnotating runs.\n\n.. autoclass:: RunJsonValue\n.. autoclass:: RunProject\n.. autoclass:: RunULabel\n.. autoclass:: RunRecord\n\nAnnotating transforms.\n\n.. autoclass:: TransformProject\n.. autoclass:: TransformReference\n.. autoclass:: TransformULabel\n\nBuilding relationships among transforms.\n\n.. autoclass:: TransformTransform\n\nAnnotating features, blocks, and ulabels with projects.\n\n.. autoclass:: FeatureProject\n.. autoclass:: BlockProject\n.. autoclass:: ULabelProject\n.. autoclass:: SchemaProject\n.. autoclass:: ProjectRecord\n\nBuilding schemas.\n\n.. autoclass:: SchemaComponent\n.. 
autoclass:: SchemaFeature\n\nAnnotating references with records.\n\n.. autoclass:: ReferenceRecord\n\nRecord values\n-------------\n\nRecord values work almost exactly like artifact and run annotations,\nwith the exception that JSON values are stored in `RecordJson` on a per-record basis\nand not in `JsonValue`.\n\n.. autoclass:: RecordArtifact\n.. autoclass:: RecordCollection\n.. autoclass:: RecordJson\n.. autoclass:: RecordProject\n.. autoclass:: RecordRecord\n.. autoclass:: RecordReference\n.. autoclass:: RecordRun\n.. autoclass:: RecordTransform\n.. autoclass:: RecordULabel\n.. autoclass:: RecordUser\n.. autoclass:: TransformRecord\n\nBlocks\n------\n\n.. autoclass:: BaseBlock\n.. autoclass:: Block\n.. autoclass:: ArtifactBlock\n.. autoclass:: BranchBlock\n.. autoclass:: CollectionBlock\n.. autoclass:: FeatureBlock\n.. autoclass:: ProjectBlock\n.. autoclass:: RecordBlock\n.. autoclass:: RunBlock\n.. autoclass:: SchemaBlock\n.. autoclass:: SpaceBlock\n.. autoclass:: TransformBlock\n.. autoclass:: ULabelBlock\n\nUtils\n-----\n\n.. autoclass:: LazyArtifact\n.. autoclass:: InspectResult\n.. autoclass:: ValidateFields\n.. autoclass:: SchemaOptionals\n.. autoclass:: lamindb.models.query_set.BiontyDB\n.. 
autoclass:: lamindb.models.query_set.PertdbDB\n\n\"\"\"\n\n# ruff: noqa: I001\n\nfrom lamin_utils._inspect import InspectResult\nfrom ._is_versioned import IsVersioned\nfrom .can_curate import CanCurate\nfrom .sqlrecord import (\n    BaseSQLRecord,\n    SQLRecord,\n    Registry,\n    Space,\n    Branch,\n    Migration,\n    ValidateFields,\n    format_field_value,\n    IsLink,\n    HasType,\n)\nfrom .storage import Storage\nfrom .transform import Transform, TransformTransform\nfrom .run import Run, TracksRun, TracksUpdates, current_run, User\nfrom .feature import Feature, JsonValue\nfrom .schema import Schema\nfrom .ulabel import ULabel\n\n# should come last as it needs everything else\nfrom .artifact import Artifact, LazyArtifact\nfrom ._feature_manager import FeatureManager\nfrom ._label_manager import LabelManager\nfrom .collection import Collection, CollectionArtifact\nfrom .project import Project, Reference\nfrom .query_manager import RelatedManager, QueryManager\nfrom .query_set import BasicQuerySet, QuerySet, DB, SQLRecordList\nfrom .artifact_set import ArtifactSet\nfrom .has_parents import HasParents\nfrom datetime import datetime as _datetime\n\n# link models\nfrom .artifact import ArtifactJsonValue, ArtifactArtifact, ArtifactUser, ArtifactRun\nfrom .project import (\n    ArtifactProject,\n    ArtifactReference,\n    BlockProject,\n    CollectionProject,\n    CollectionReference,\n    FeatureProject,\n    ProjectRecord,\n    RecordProject,\n    RecordReference,\n    ReferenceRecord,\n    RunProject,\n    SchemaProject,\n    TransformProject,\n    TransformReference,\n    ULabelProject,\n)\nfrom .run import RunJsonValue\nfrom .schema import (\n    SchemaFeature,\n    ArtifactSchema,\n    SchemaComponent,\n    SchemaOptionals,\n)\nfrom .ulabel import ArtifactULabel, TransformULabel, RunULabel, CollectionULabel\n\nfrom .record import (\n    Record,\n    ArtifactRecord,\n    CollectionRecord,\n    RecordArtifact,\n    RecordCollection,\n    RecordJson,\n    
RecordRecord,\n    RecordRun,\n    RecordTransform,\n    RecordULabel,\n    RecordUser,\n    RunRecord,\n    TransformRecord,\n)\nfrom .block import (\n    BaseBlock,\n    Block,\n    ArtifactBlock,\n    BranchBlock,\n    CollectionBlock,\n    FeatureBlock,\n    ProjectBlock,\n    RecordBlock,\n    RunBlock,\n    SchemaBlock,\n    SpaceBlock,\n    TransformBlock,\n    ULabelBlock,\n)\n\nFeatureValue = JsonValue  # backward compatibility\n"
  },
  {
    "path": "lamindb/models/_describe.py",
    "content": "from __future__ import annotations\n\nimport re\nfrom types import SimpleNamespace\nfrom typing import TYPE_CHECKING, Literal\n\nfrom django.db import connections\nfrom django.db.models import Q\nfrom lamin_utils import colors, logger\nfrom rich.table import Column, Table\nfrom rich.text import Text\nfrom rich.tree import Tree\n\nfrom lamindb.models import BaseSQLRecord, Branch, Run\n\nfrom ._is_versioned import IsVersioned\nfrom .sqlrecord import SQLRecord, format_field_value\n\nif TYPE_CHECKING:\n    from lamindb.models import Artifact, Collection, Record, Schema, Transform\n\n    from .run import TracksRun\n\n\n# Define consistent column widths for use in other modules\nNAME_WIDTH = 30\nTYPE_WIDTH = 35  # types can get long, e.g. cat[Record[Treatment]]\nVALUES_WIDTH = 40\n\n\ndef strip_ansi_from_string(text: str) -> str:\n    \"\"\"Remove ANSI escape sequences from a string.\"\"\"\n    ansi_escape = re.compile(r\"\\x1B(?:[@-Z\\\\-_]|\\[[0-?]*[ -/]*[@-~])\")\n    return ansi_escape.sub(\"\", text)\n\n\ndef format_rich_tree(\n    tree: Tree, return_str: bool = False, strip_ansi: bool = True\n) -> str | None:\n    from rich.console import Console\n\n    from ..core._context import is_run_from_ipython\n\n    console = Console(force_terminal=True)\n    printed = False\n\n    if return_str:\n        from io import StringIO\n\n        string_io = StringIO()\n        str_console = Console(file=string_io, force_terminal=True)\n        str_console.print(tree)\n        result = string_io.getvalue()\n        if strip_ansi:\n            result = strip_ansi_from_string(result)\n        # rstrip trailing whitespace on every line\n        result = \"\\n\".join(line.rstrip() for line in result.splitlines())\n        return result\n\n    try:\n        if not is_run_from_ipython:\n            from IPython import get_ipython\n            from IPython.core.interactiveshell import InteractiveShell\n            from IPython.display import display\n\n            shell = 
get_ipython()\n            if isinstance(shell, InteractiveShell):\n                display(tree)\n                printed = True\n                return None\n    except (NameError, ImportError):\n        pass\n\n    if not printed:\n        # be careful to test this on a terminal\n        console = Console(force_terminal=True)\n        console.print(tree)\n\n    return None\n\n\ndef format_run_title(\n    record: Run | SimpleNamespace | None,\n    transform_key: str | None = None,\n    dim: bool = False,\n) -> Text:\n    if record is None:\n        return Text(\"\")\n    display_name = (\n        Text(record.name, style=\"cyan3\")\n        if record.name is not None\n        else Text(record.uid[:7], style=\"cyan3\")\n    )\n    if transform_key is None:\n        transform_key = record.transform.key\n    title = Text.assemble(\n        display_name,\n        (\" (\", \"dim\"),\n        (transform_key, \"cyan3\"),\n        (\")\", \"dim\"),\n    )\n    return title\n\n\ndef format_title_with_version(\n    record: IsVersioned | SimpleNamespace,\n) -> Text:\n    title_str = record.key if record.key is not None else \"\"\n    title = Text.assemble(\n        (title_str, \"cyan3\"),\n        (f\" ({record.version})\", \"dim\"),\n        Text.assemble((\"\\n|   description: \", \"dim\"), record.description)\n        if record.description\n        else Text(\"\"),\n    )\n    return title\n\n\ndef describe_header(record: BaseSQLRecord) -> Tree:\n    if isinstance(record, IsVersioned) and not record.is_latest:\n        logger.warning(\n            f\"This is not the latest version of the {record.__class__.__name__}.\"\n        )\n    if isinstance(record, SQLRecord):\n        if record.branch_id == 0:\n            logger.warning(\"This artifact is archived.\")\n        elif record.branch_id == -1:\n            logger.warning(\"This artifact is in the trash.\")\n    if isinstance(record, Run):\n        title = format_run_title(record, dim=True)  # dim makes the uid grey\n  
  elif isinstance(record, IsVersioned) or isinstance(record, SimpleNamespace):\n        title = format_title_with_version(record)\n    else:\n        display_field = (\n            record._name_field\n            if hasattr(record, \"_name_field\")\n            else \"name\"\n            if hasattr(record, \"name\")\n            else \"\"\n        )\n        display_value = getattr(record, display_field, None) if display_field else None\n        if display_value in (None, \"\"):\n            display_value = record.uid[:7] if hasattr(record, \"uid\") else \"\"\n        title = Text.assemble(\n            (\n                str(display_value),\n                \"cyan3\",\n            )\n        )\n    tree = Tree(\n        Text.assemble(\n            (f\"{record.__class__.__name__}: \", \"bold\"),\n            title,\n        ),\n        guide_style=\"dim\",  # dim the connecting lines\n    )\n    return tree\n\n\ndef format_bytes(bytes_value):\n    \"\"\"Convert bytes to human readable format.\"\"\"\n    if bytes_value < 1024:\n        return f\"{bytes_value} B\"\n    elif bytes_value < 1024**2:\n        return f\"{bytes_value / 1024:.1f} KB\"\n    elif bytes_value < 1024**3:\n        return f\"{bytes_value / (1024**2):.1f} MB\"\n    elif bytes_value < 1024**4:\n        return f\"{bytes_value / (1024**3):.1f} GB\"\n    else:\n        return f\"{bytes_value / (1024**4):.1f} TB\"\n\n\ndef append_uid_run(record: TracksRun, two_column_items: list, fk_data=None) -> None:\n    if fk_data and \"run\" in fk_data and fk_data[\"run\"] and fk_data[\"run\"][\"id\"]:\n        run, transform_key = (\n            SimpleNamespace(**fk_data[\"run\"]),\n            fk_data[\"run\"][\"transform_key\"],\n        )\n    elif record.run is not None:\n        run, transform_key = record.run, record.run.transform.key\n    else:\n        run, transform_key = None, None\n    text_uid = Text.assemble((\"uid: \", \"dim\"), f\"{record.uid}\")\n    text_run = Text.assemble(\n        (\"run: \", 
\"dim\"), format_run_title(run, transform_key=transform_key)\n    )\n    two_column_items.append(text_uid)\n    two_column_items.append(text_run)\n\n\ndef append_branch_space_created_at_created_by(\n    record: SQLRecord, two_column_items, fk_data=None\n):\n    # branch\n    branch_name = fk_data[\"branch\"][\"name\"] if fk_data else record.branch.name\n    two_column_items.append(Text.assemble((\"branch: \", \"dim\"), branch_name))\n    # space\n    space_name = fk_data[\"space\"][\"name\"] if fk_data else record.space.name\n    two_column_items.append(Text.assemble((\"space: \", \"dim\"), space_name))\n    # created_at\n    two_column_items.append(\n        Text.assemble((\"created_at: \", \"dim\"), format_field_value(record.created_at))\n    )\n    # created_by / \"name\" in fk_data holds handle, is display name\n    created_by_handle = (\n        fk_data[\"created_by\"][\"name\"] if fk_data else record.created_by.handle\n    )\n    two_column_items.append(Text.assemble((\"created_by: \", \"dim\"), created_by_handle))\n\n\ndef add_two_column_items_to_tree(tree: Tree, two_column_items: list) -> None:\n    table = Table(\n        Column(\"\", no_wrap=True),\n        Column(\"\", no_wrap=True),\n        show_header=False,\n        box=None,\n        pad_edge=False,\n    )\n    for i in range(0, len(two_column_items), 2):\n        if i + 1 < len(two_column_items):\n            left_item = two_column_items[i]\n            right_item = two_column_items[i + 1]\n            table.add_row(left_item, right_item)\n        else:\n            table.add_row(two_column_items[i], \"\")\n    tree.add(table)\n\n\ndef describe_artifact(\n    record: Artifact,\n    related_data: dict | None = None,\n) -> Tree:\n    from ._feature_manager import describe_features\n    from ._label_manager import describe_labels\n\n    if related_data is not None:\n        fk_data = related_data.get(\"fk\", {})\n    else:\n        fk_data = {}\n    tree = describe_header(record)\n    
dataset_features_tree, external_features_tree = describe_features(\n        record,\n        related_data=related_data,\n    )\n    labels_tree = describe_labels(record, related_data=related_data)\n    two_column_items = []  # type: ignore\n    append_uid_run(record, two_column_items, fk_data)\n    if record.kind or record.otype:\n        two_column_items.append(Text.assemble((\"kind: \", \"dim\"), f\"{record.kind}\"))\n        two_column_items.append(Text.assemble((\"otype: \", \"dim\"), f\"{record.otype}\"))\n    two_column_items.append(Text.assemble((\"hash: \", \"dim\"), f\"{record.hash}\"))\n    two_column_items.append(\n        Text.assemble((\"size: \", \"dim\"), f\"{format_bytes(record.size)}\")\n    )\n    append_branch_space_created_at_created_by(record, two_column_items, fk_data)\n    if record.n_observations:\n        two_column_items.append(\n            Text.assemble((\"n_observations: \", \"dim\"), f\"{record.n_observations}\")\n        )\n    if record.n_files:\n        two_column_items.append(\n            Text.assemble((\"n_files: \", \"dim\"), f\"{record.n_files}\")\n        )\n    schema_name = None\n    if fk_data and \"schema\" in fk_data and fk_data[\"schema\"]:\n        schema_name = fk_data[\"schema\"][\"name\"]\n    elif record.schema_id is not None and record.schema is not None:\n        schema_name = (\n            record.schema.name\n            if record.schema.name is not None\n            else record.schema.uid[:7]\n        )\n    if schema_name is not None:\n        two_column_items.append(Text.assemble((\"schema: \", \"dim\"), schema_name))\n    add_two_column_items_to_tree(tree, two_column_items)\n    storage_root = fk_data[\"storage\"][\"name\"] if fk_data else record.storage.root\n    storage_key = (\n        record.key\n        if not record._key_is_virtual\n        else record._real_key\n        if record._real_key\n        else f\".lamindb/{record.uid}\"\n    )\n    if record.uid in storage_key:\n        if 
record.overwrite_versions:\n            storage_key = storage_key[:-4]\n        storage_key = f\"{storage_key}{record.suffix}\"\n    tree.add(\n        Text.assemble(\n            (\"storage/path: \", \"dim\"),\n            (storage_root, \"cyan3\"),\n            (\"/\", \"dim\"),\n            storage_key,\n        )\n    )\n    if dataset_features_tree:\n        tree.add(dataset_features_tree)\n    if external_features_tree:\n        tree.add(external_features_tree)\n    if labels_tree:\n        tree.add(labels_tree)\n    return tree\n\n\ndef describe_collection(\n    record: Collection,\n    related_data: dict | None = None,\n) -> Tree:\n    tree = describe_header(record)\n    if related_data is not None:\n        fk_data = related_data.get(\"fk\", {})\n    else:\n        fk_data = {}\n    two_column_items = []  # type: ignore\n    append_uid_run(record, two_column_items, fk_data)\n    append_branch_space_created_at_created_by(record, two_column_items, fk_data)\n    add_two_column_items_to_tree(tree, two_column_items)\n    return tree\n\n\ndef display_text(\n    text: str, title: str, tree: Tree, max_lines: int = 30, uid: str = \"\"\n) -> None:\n    # Split the code into lines and add dim vertical bars\n    lines = text.split(\"\\n\")\n    end_parts = [(\"\\n│ …\", \"grey30\")] if len(lines) > max_lines else []\n    parts = [(title + \": \", \"purple\")]\n    parts.append((uid, \"\"))\n    max_length = 80\n    for line in lines[:max_lines]:\n        parts.append((\"\\n│ \", \"dim\"))\n        parts.append((line[:max_length], \"grey30\"))\n        if len(line) > max_length:\n            parts.append((\" …\", \"grey30\"))\n    parts.extend(end_parts)\n    tree.add(Text.assemble(*parts))\n\n\ndef describe_run(\n    record: Run,\n    related_data: dict | None = None,\n) -> Tree:\n    from ._feature_manager import describe_features\n\n    tree = describe_header(record)\n    if related_data is not None:\n        fk_data = related_data.get(\"fk\", {})\n    else:\n       
 fk_data = {}\n    _, features_tree = describe_features(\n        record,\n        related_data=related_data,\n    )\n    two_column_items = []  # type: ignore\n    two_column_items.append(Text.assemble((\"uid: \", \"dim\"), f\"{record.uid}\"))\n    if fk_data and \"transform\" in fk_data:\n        transform = SimpleNamespace(**fk_data[\"transform\"], description=\"\")\n    else:\n        transform = record.transform\n    transform_key = transform.key if transform and transform.key is not None else \"\"\n    transform_version = (\n        f\" ({transform.version})\" if transform and transform.version is not None else \"\"\n    )\n    two_column_items.append(\n        Text.assemble(\n            (\"transform: \", \"dim\"),\n            (transform_key, \"cyan3\"),\n            (transform_version, \"dim\"),\n        )\n    )\n    two_column_items.append(\n        Text.assemble(\n            (\"started_at: \", \"dim\"), format_field_value(record.started_at, none=\"\")\n        )\n    )\n    two_column_items.append(\n        Text.assemble(\n            (\"finished_at: \", \"dim\"), format_field_value(record.finished_at, none=\"\")\n        )\n    )\n    two_column_items.append(Text.assemble((\"status: \", \"dim\"), record.status))\n    two_column_items.append(\n        Text.assemble((\"reference: \", \"dim\"), record.reference)\n        if record.reference\n        else Text(\"\")\n    )\n    append_branch_space_created_at_created_by(record, two_column_items, fk_data)\n    add_two_column_items_to_tree(tree, two_column_items)\n    if record.cli_args:\n        display_text(\n            record.cli_args.strip(),\n            \"cli_args\",\n            tree,\n            max_lines=4,\n        )\n    if record.report_id:\n        report = record.report.load(is_run_input=False)\n        if report:\n            report_str = report if isinstance(report, str) else str(report)\n            display_text(\n                strip_ansi_from_string(report_str.strip()),\n                
\"report\",\n                tree,\n                max_lines=4,\n                uid=record.report.uid[:7],\n            )\n    if record.environment_id:\n        env_result = record.environment.load(is_run_input=False)\n        env_str = env_result if isinstance(env_result, str) else str(env_result)\n        display_text(\n            env_str.strip(),\n            \"environment\",\n            tree,\n            max_lines=4,\n            uid=record.environment.uid[:7],\n        )\n    if record.params:\n        params = tree.add(Text(\"Params\", style=\"bold dark_orange\"))\n        for key, value in record.params.items():\n            params.add(f\"{key}: {value}\")\n    if features_tree:\n        tree.add(features_tree)\n    return tree\n\n\ndef describe_record(\n    record: Record,\n    related_data: dict | None = None,\n) -> Tree:\n    from ._feature_manager import describe_features\n\n    tree = describe_header(record)\n    if related_data is not None:\n        fk_data = related_data.get(\"fk\", {})\n    else:\n        fk_data = {}\n    _, features_tree = describe_features(\n        record,\n        related_data=related_data,\n    )\n    two_column_items = []  # type: ignore\n    append_uid_run(record, two_column_items, fk_data)\n    type_name = (\n        fk_data[\"type\"][\"name\"]\n        if fk_data and \"type\" in fk_data and fk_data[\"type\"]\n        else record.type.name\n        if record.type_id is not None\n        else \"\"\n    )\n    if type_name is None:\n        type_name = \"\"\n    two_column_items.append(Text.assemble((\"type: \", \"dim\"), type_name))\n    two_column_items.append(Text.assemble((\"is_type: \", \"dim\"), f\"{record.is_type}\"))\n    schema_name = (\n        fk_data[\"schema\"][\"name\"]\n        if fk_data and \"schema\" in fk_data and fk_data[\"schema\"]\n        else record.schema.name\n        if record.schema_id is not None\n        else \"\"\n    )\n    if schema_name is None:\n        schema_name = \"\"\n    
two_column_items.append(Text.assemble((\"schema: \", \"dim\"), schema_name))\n    reference = record.reference if record.reference is not None else \"\"\n    two_column_items.append(Text.assemble((\"reference: \", \"dim\"), reference))\n    append_branch_space_created_at_created_by(record, two_column_items, fk_data)\n    add_two_column_items_to_tree(tree, two_column_items)\n    if features_tree:\n        tree.add(features_tree)\n    return tree\n\n\ndef describe_transform(\n    record: Transform,\n    related_data: dict | None = None,\n) -> Tree:\n    tree = describe_header(record)\n    if related_data is not None:\n        fk_data = related_data.get(\"fk\", {})\n    else:\n        fk_data = {}\n    two_column_items = []  # type: ignore\n    two_column_items.append(Text.assemble((\"uid: \", \"dim\"), f\"{record.uid}\"))\n    two_column_items.append(\n        Text.assemble((\"reference: \", \"dim\"), record.reference)\n        if record.reference\n        else Text(\"\")\n    )\n    two_column_items.append(Text.assemble((\"hash: \", \"dim\"), f\"{record.hash}\"))\n    two_column_items.append(Text.assemble((\"type: \", \"dim\"), f\"{record.type}\"))\n    append_branch_space_created_at_created_by(record, two_column_items, fk_data)\n    add_two_column_items_to_tree(tree, two_column_items)\n    if record.source_code:\n        display_text(record.source_code.strip(), \"source_code\", tree)\n    return tree\n\n\ndef describe_branch(record: Branch) -> Tree:\n    tree = describe_header(record)\n    two_column_items = []  # type: ignore\n    two_column_items.append(Text.assemble((\"status: \", \"dim\"), record.status))\n    two_column_items.append(Text.assemble((\"space: \", \"dim\"), record.space.name))\n    two_column_items.append(\n        Text.assemble((\"created_at: \", \"dim\"), format_field_value(record.created_at))\n    )\n    two_column_items.append(\n        Text.assemble((\"created_by: \", \"dim\"), record.created_by.handle)\n    )\n    
add_two_column_items_to_tree(tree, two_column_items)\n    return tree\n\n\ndef describe_schema(record: Schema, slot: str | None = None) -> Tree:\n    from ._feature_manager import format_dtype_for_display, strip_cat\n\n    if record.type:\n        prefix = f\" {record.type.name} · \"\n    else:\n        prefix = \" \"\n    if record.name:\n        name = record.name\n    else:\n        name = \"unnamed\"\n    header = \"Schema:\" if slot is None else f\"{slot}:\"\n    description = (\n        Text.assemble((\"\\n|   description: \", \"dim\"), record.description)\n        if record.description\n        else Text(\"\")\n    )\n    tree = Tree(\n        Text.assemble(\n            (header, \"bold\"), (f\"{prefix}\", \"dim\"), (f\"{name}\", \"cyan3\"), description\n        ),\n        guide_style=\"dim\",\n    )\n    two_column_items = []  # type: ignore\n    append_uid_run(record, two_column_items)\n    two_column_items.append(Text.assemble((\"itype: \", \"dim\"), f\"{record.itype}\"))\n    two_column_items.append(Text.assemble((\"otype: \", \"dim\"), f\"{record.otype}\"))\n    two_column_items.append(Text.assemble((\"hash: \", \"dim\"), f\"{record.hash}\"))\n    two_column_items.append(\n        Text.assemble((\"ordered_set: \", \"dim\"), f\"{record.ordered_set}\")\n    )\n    two_column_items.append(\n        Text.assemble((\"maximal_set: \", \"dim\"), f\"{record.maximal_set}\")\n    )\n    two_column_items.append(\n        Text.assemble((\"minimal_set: \", \"dim\"), f\"{record.minimal_set}\")\n    )\n    append_branch_space_created_at_created_by(record, two_column_items)\n    add_two_column_items_to_tree(tree, two_column_items)\n\n    # Add features section\n    n_members = record.n_members\n    members_count_display = f\" ({n_members})\" if n_members else \"\"\n    if n_members or (record.dtype and record.itype is not None):\n        features = tree.add(\n            Text.assemble(\n                (\n                    \"Features\" if record.itype == \"Feature\" 
else record.itype,\n                    \"bold bright_magenta\",\n                ),\n                (members_count_display, \"bold dim\"),\n            )\n        )\n        if n_members is not None:\n            feature_table = Table(\n                show_header=True, header_style=\"dim\", box=None, pad_edge=False\n            )\n\n            feature_table.add_column(\"name\", style=\"\", no_wrap=True)\n            feature_table.add_column(\"dtype\", style=\"\", no_wrap=True)\n            feature_table.add_column(\"optional\", style=\"\", no_wrap=True)\n            feature_table.add_column(\"nullable\", style=\"\", no_wrap=True)\n            feature_table.add_column(\"coerce\", style=\"\", no_wrap=True)\n            feature_table.add_column(\"default_value\", style=\"\", no_wrap=True)\n\n            optionals = record.optionals.get()\n            for member in record.members:\n                feature_table.add_row(\n                    Text(member.name),\n                    Text(strip_cat(format_dtype_for_display(member._dtype_str))),\n                    \"✓\" if optionals.filter(uid=member.uid).exists() else \"✗\",\n                    \"✓\" if member.nullable else \"✗\",\n                    \"✓\" if record.coerce or member.coerce else \"✗\",\n                    str(member.default_value) if member.default_value else \"unset\",\n                )\n\n            features.add(feature_table)\n        elif record.dtype:\n            features.add(Text.assemble((\"dtype: \", \"dim\"), f\"{record.dtype}\"))\n\n    return tree\n\n\ndef describe_postgres(record):\n    from ._django import get_artifact_or_run_with_related, get_collection_with_related\n\n    model_name = record.__class__.__name__\n    msg = f\"{colors.green(model_name)}{record.__repr__(include_foreign_keys=False).lstrip(model_name)}\\n\"\n    if record._state.db is not None and record._state.db != \"default\":\n        msg += f\"  {colors.italic('Database instance')}\\n\"\n        msg += f\"    slug: 
{record._state.db}\\n\"\n    if model_name in {\"Artifact\", \"Run\"}:\n        result = get_artifact_or_run_with_related(\n            record,\n            include_feature_link=True,\n            include_fk=True,\n            include_m2m=True,\n            include_schema=True,\n        )\n        related_data = result.get(\"related_data\", {})\n        if model_name == \"Artifact\":\n            tree = describe_artifact(record, related_data=related_data)\n        else:\n            tree = describe_run(record, related_data=related_data)\n    elif model_name == \"Record\":\n        result = get_artifact_or_run_with_related(\n            record,\n            include_feature_link=True,\n            include_fk=True,\n        )\n        related_data = result.get(\"related_data\", {})\n        tree = describe_record(record, related_data=related_data)\n    elif model_name == \"Collection\":\n        result = get_collection_with_related(record, include_fk=True)\n        related_data = result.get(\"related_data\", {})\n        tree = describe_collection(record, related_data=related_data)\n    elif model_name == \"Transform\":\n        tree = describe_transform(record)\n    elif model_name == \"Branch\":\n        tree = describe_branch(record)\n    else:\n        tree = describe_header(record)\n    return tree\n\n\ndef describe_sqlite(record):\n    model_name = record.__class__.__name__\n    msg = f\"{colors.green(model_name)}{record.__repr__(include_foreign_keys=False).lstrip(model_name)}\\n\"\n    if record._state.db is not None and record._state.db != \"default\":\n        msg += f\"  {colors.italic('Database instance')}\\n\"\n        msg += f\"    slug: {record._state.db}\\n\"\n\n    fields = record._meta.fields\n    direct_fields = []\n    foreign_key_fields = []\n    for f in fields:\n        if f.is_relation:\n            foreign_key_fields.append(f.name)\n        else:\n            direct_fields.append(f.name)\n    if not record._state.adding:\n        # prefetch 
foreign key relationships\n        record = (\n            record.__class__.objects.using(record._state.db)\n            .select_related(*foreign_key_fields)\n            .get(id=record.id)\n        )\n        # prefetch m-2-m relationships\n        many_to_many_fields = []\n        if model_name in {\"Artifact\", \"Collection\"}:\n            many_to_many_fields.append(\"input_of_runs\")\n        if model_name == \"Artifact\":\n            many_to_many_fields.append(\"schemas\")\n        record = (\n            record.__class__.objects.using(record._state.db)\n            .prefetch_related(*many_to_many_fields)\n            .get(id=record.id)\n        )\n    if model_name in {\"Artifact\", \"Run\", \"Record\"}:\n        if model_name == \"Artifact\":\n            tree = describe_artifact(record)\n        elif model_name == \"Run\":\n            tree = describe_run(record)\n        else:\n            tree = describe_record(record)\n    elif model_name == \"Collection\":\n        tree = describe_collection(record)\n    elif model_name == \"Transform\":\n        tree = describe_transform(record)\n    elif model_name == \"Branch\":\n        tree = describe_branch(record)\n    else:\n        tree = describe_header(record)\n    return tree\n\n\ndef append_readme_blocks_to_tree(\n    record, tree: Tree, include: None | Literal[\"comments\"] = None\n) -> None:\n    \"\"\"Append readme (and optionally comment) block content to the describe tree.\"\"\"\n    if record._state.adding:\n        return\n    if not hasattr(record, \"ablocks\"):\n        return\n    if include == \"comments\":\n        blocks_qs = record.ablocks.filter(\n            Q(kind=\"readme\", is_latest=True) | Q(kind=\"comment\")\n        ).select_related(\"created_by\")\n    else:\n        blocks_qs = record.ablocks.filter(kind=\"readme\", is_latest=True)\n    blocks = list(blocks_qs.order_by(\"created_at\"))\n    # README first, then comments; each group sorted chronologically\n    readme_blocks = [b 
for b in blocks if b.kind == \"readme\"]\n    comment_blocks = [b for b in blocks if b.kind == \"comment\"]\n    for block in readme_blocks + comment_blocks:\n        if block.kind == \"readme\":\n            title = \"README\"\n        else:\n            handle = block.created_by.handle if block.created_by else \"?\"\n            created_at_str = format_field_value(block.created_at)\n            title = f\"comment by {handle} at {created_at_str}\"\n        display_text(\n            block.content,\n            title,\n            tree,\n            max_lines=30,\n            uid=\"\",\n        )\n\n\ndef describe_postgres_sqlite(\n    record,\n    return_str: bool = False,\n    include: None | Literal[\"comments\"] = None,\n) -> str | None:\n    from ._describe import format_rich_tree\n\n    if (\n        not record._state.adding\n        and connections[record._state.db].vendor == \"postgresql\"\n    ):\n        tree = describe_postgres(record)\n    else:\n        tree = describe_sqlite(record)\n    append_readme_blocks_to_tree(record, tree, include=include)\n    return format_rich_tree(tree, return_str=return_str)\n"
  },
  {
    "path": "lamindb/models/_django.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Any\n\nfrom django.contrib.postgres.aggregates import ArrayAgg\nfrom django.db import connection\nfrom django.db.models import CharField, F, OuterRef, Q, Subquery\nfrom django.db.models.fields.related import ForeignKey, ManyToManyField\nfrom django.db.models.fields.reverse_related import ManyToManyRel, ManyToOneRel\nfrom django.db.models.functions import JSONObject\n\nfrom ._relations import dict_related_model_to_related_name, get_schema_modules\nfrom .schema import Schema\n\nif TYPE_CHECKING:\n    from .artifact import Artifact, Collection\n    from .record import Record\n    from .run import Run\n\n\ndef patch_many_to_many_descriptor() -> None:\n    \"\"\"Patches Django's `ManyToManyDescriptor.__get__` method to suggest better errors when saving relationships of an unsaved model.\n\n    Before this patch: Cryptic errors are raised when relationships of an unsaved record are attempted to be modified.\n\n    After this patch: Attempts to access M2M relationships on unsaved objects will raise ValueError,\n    suggesting explicit .save() of the record to be modified before relationship creation.\n    \"\"\"\n    from django.db.models.fields.related_descriptors import ManyToManyDescriptor\n\n    original_get = ManyToManyDescriptor.__get__\n\n    def patched_get(self, instance, cls=None):\n        if instance is not None and instance.pk is None:\n            raise ValueError(\n                f\"You are trying to access the many-to-many relationships of an unsaved {instance.__class__.__name__} object. 
\"\n                f\"Please save it first using '.save()'.\"\n            )\n\n        manager = original_get(self, instance, cls)\n        if manager is None or not hasattr(manager, \"add\"):\n            return manager\n\n        original_manager_add = manager.add\n\n        def patched_manager_add(*objs, **kwargs):\n            try:\n                return original_manager_add(*objs, **kwargs)\n            except ValueError as e:\n                if \"Cannot add\" in str(e) and \"database\" in str(e):\n                    source_db = manager.instance._state.db\n\n                    raise ValueError(\n                        f\"Cannot label a record from instance '{source_db}'. \"\n                        f\"Please save the record first to your instance using '.save()'.\"\n                    ) from None\n                raise\n\n        manager.add = patched_manager_add\n        return manager\n\n    ManyToManyDescriptor.__get__ = patched_get\n\n\ndef get_related_model(model, field_name):\n    try:\n        field = model._meta.get_field(field_name)\n\n        if isinstance(field, (ForeignKey, ManyToManyField)):\n            # Forward ForeignKey or ManyToManyField\n            return field.remote_field.model\n        elif isinstance(field, (ManyToOneRel, ManyToManyRel)):\n            # Reverse ForeignKey or ManyToManyField\n            return field.related_model\n        else:\n            return f\"Unexpected field type: {type(field)}\"\n    except Exception as e:\n        return f\"Error: {str(e)}\"\n\n\ndef get_artifact_or_run_with_related(\n    record: Artifact | Run | Record,\n    include_fk: bool = False,\n    include_m2m: bool = False,\n    include_feature_link: bool = False,\n    include_schema: bool = False,\n) -> dict[str, Any]:\n    \"\"\"Fetch an artifact with its related data.\"\"\"\n    from ._label_manager import EXCLUDE_LABELS\n    from .can_curate import get_name_field\n    from .query_set import get_default_branch_ids\n\n    model = 
record.__class__\n    is_record = record.__class__.__name__ == \"Record\"\n    is_artifact = record.__class__.__name__ == \"Artifact\"\n    entity_field_name = record.__class__.__name__.lower()\n    if entity_field_name in {\"run\", \"record\"} and include_schema:\n        include_schema = False  # runs do not have feature sets\n    schema_modules = get_schema_modules(record._state.db)\n\n    foreign_key_fields = [\n        f.name\n        for f in model._meta.fields\n        if f.is_relation and f.related_model.__get_module_name__() in schema_modules\n    ]\n\n    # Create the map that the conversion function will need.\n    # It maps the target model class to the m2m field name, e.g.,\n    # {'Ulabel': 'ulabels', 'CellType': 'cell_types'}\n    m2m_model_to_field_map = {}\n    if include_m2m:\n        full_map = dict_related_model_to_related_name(model, instance=record._state.db)\n        m2m_model_to_field_map = {\n            model_cls: field_name\n            for model_cls, field_name in full_map.items()\n            if not field_name.startswith(\"_\") and field_name not in EXCLUDE_LABELS\n        }\n        if is_record:\n            m2m_model_to_field_map[\"Run\"] = \"linked_runs\"\n        else:\n            m2m_model_to_field_map[\"Run\"] = \"runs\"\n    link_tables = (\n        []\n        if not include_feature_link\n        else list(\n            dict_related_model_to_related_name(\n                model, links=True, instance=record._state.db\n            ).values()\n        )\n    )\n\n    # Clear previous queries\n    connection.queries_log.clear()\n\n    annotations = {}\n\n    if include_fk:\n        for fk in foreign_key_fields:\n            name_field = get_name_field(get_related_model(model, fk))\n            if fk == \"run\":\n                annotations[f\"fkfield_{fk}\"] = JSONObject(\n                    id=F(f\"{fk}__id\"),\n                    name=F(f\"{fk}__name\"),\n                    uid=F(f\"{fk}__uid\"),\n                    
transform_key=F(f\"{fk}__transform__key\"),\n                )\n            elif fk == \"transform\":\n                annotations[f\"fkfield_{fk}\"] = JSONObject(\n                    id=F(f\"{fk}__id\"),\n                    key=F(f\"{fk}__key\"),\n                    uid=F(f\"{fk}__uid\"),\n                    version=F(f\"{fk}__version_tag\"),\n                )\n            elif fk == \"created_by\":\n                annotations[f\"fkfield_{fk}\"] = JSONObject(\n                    id=F(f\"{fk}__id\"), name=F(f\"{fk}__{name_field}\")\n                )\n            else:\n                annotations[f\"fkfield_{fk}\"] = JSONObject(\n                    id=F(f\"{fk}__id\"), name=F(f\"{fk}__{name_field}\")\n                )\n\n    for link in link_tables:\n        link_model = getattr(model, link).rel.related_model\n        if not hasattr(link_model, \"feature\"):\n            continue\n        if not is_record and link_model.__name__ in {\n            \"RecordArtifact\",\n            \"RecordRun\",\n        }:\n            continue\n        if is_record and (\n            not link_model.__name__.startswith(\"Record\")\n            or link_model.__name__\n            in {\n                \"RecordJson\",\n            }\n        ):\n            continue\n        if not is_record and not link_model.__name__ == \"ArtifactArtifact\":\n            if link_model.__name__ == \"RunArtifact\":\n                if is_artifact:\n                    continue\n                else:\n                    label_field = \"artifact\"\n            else:\n                label_field = link.removeprefix(\"links_\").replace(\"_\", \"\")\n        else:\n            label_field = \"value\"\n        related_model = link_model._meta.get_field(label_field).related_model\n        # manually include \"name\" as pertdb.Compound.name is a TextField due to no length limitation\n        char_field_names = [\n            field.name\n            for field in related_model._meta.concrete_fields\n 
           if isinstance(field, CharField) or field.name == \"name\"\n        ]\n        name_field = get_name_field(related_model)\n        label_field_name = f\"{label_field}__{name_field}\"\n        filter_kwargs = {entity_field_name: OuterRef(\"pk\")}\n        if link_model.__name__ not in {\n            \"RecordUser\",\n            \"ArtifactUser\",\n        }:  # user does not have branch\n            filter_kwargs[f\"{label_field}__branch_id__in\"] = get_default_branch_ids()\n        annotations[f\"linkfield_{link}\"] = Subquery(\n            link_model.objects.filter(**filter_kwargs)\n            .annotate(\n                data=JSONObject(\n                    id=F(\"id\"),\n                    feature=F(\"feature\"),\n                    **{label_field: F(label_field)},\n                    **{\n                        label_field + \"_display\": F(label_field_name)\n                    },  # display field is the name field\n                    **{uf: F(f\"{label_field}__{uf}\") for uf in char_field_names},\n                )\n            )\n            .values(entity_field_name)\n            .annotate(json_agg=ArrayAgg(\"data\"))\n            .values(\"json_agg\")\n        )\n\n    if include_schema:\n        annotations[\"m2m_schemas\"] = Subquery(\n            model.schemas.through.objects.filter(artifact=OuterRef(\"pk\"))\n            .annotate(\n                data=JSONObject(\n                    id=F(\"id\"),\n                    slot=F(\"slot\"),\n                    schema=F(\"schema\"),\n                )\n            )\n            .values(entity_field_name)\n            .annotate(json_agg=ArrayAgg(\"data\"))\n            .values(\"json_agg\")\n        )\n\n    record_meta = (\n        model.objects.using(record._state.db)\n        .filter(uid=record.uid)\n        .annotate(**annotations)\n        .values(*[\"id\", \"uid\"], *annotations.keys())\n        .first()\n    )\n\n    if not record_meta:\n        return None\n\n    related_data: dict 
= {\"m2m\": {}, \"fk\": {}, \"link\": {}, \"m2m_schemas\": {}}\n    for k, v in record_meta.items():\n        if k.startswith(\"fkfield_\") and v is not None:\n            related_data[\"fk\"][k[8:]] = v\n        elif k.startswith(\"linkfield_\") and v is not None:\n            related_data[\"link\"][k[10:]] = v\n        elif k == \"m2m_schemas\":\n            if v:\n                related_data[\"m2m_schemas\"] = get_schema_m2m_relations(\n                    record, {i[\"schema\"]: i[\"slot\"] for i in v}\n                )\n\n    def convert_link_data_to_m2m(\n        link_data: dict,\n        model,  # The main artifact model class is still needed for introspection\n        m2m_model_map: dict,  # The pre-computed map from Step 1\n    ) -> dict:\n        \"\"\"Converts link data to M2M-style data using a pre-computed model-to-field-name map.\"\"\"\n        # link_data: {'links_tissue': [{'id': 1, 'uid': '1fIFAQJY', 'abbr': None, 'name': 'brain', 'tissue': 1, 'feature': 1, 'ontology_id': 'UBERON:0000955', 'tissue_display': 'brain'}, {'id': 2, 'uid': '7Tt4iEKc', 'abbr': None, 'name': 'lung', 'tissue': 10, 'feature': 1, 'ontology_id': 'UBERON:0002048', 'tissue_display': 'lung'}], 'links_cell_type': [{'id': 1, 'uid': '3QnZfoBk', 'abbr': None, 'name': 'neuron', 'feature': 2, 'celltype': 1, 'ontology_id': 'CL:0000540', 'celltype_display': 'neuron'}]}\n        m2m_data = {}\n        for link_name, records in link_data.items():\n            if not records:\n                continue\n            link_model = getattr(model, link_name).rel.related_model\n            if not is_record:\n                id_field_name = link_name.removeprefix(\"links_\").replace(\"_\", \"\")\n            else:\n                id_field_name = \"value\"\n            final_target_model = link_model._meta.get_field(id_field_name).related_model\n            m2m_field_name = m2m_model_map.get(\n                final_target_model.__get_name_with_module__()\n            )\n            
m2m_data[m2m_field_name] = {\n                record[id_field_name]: record for record in records\n            }\n        return m2m_data\n\n    related_data[\"m2m\"] = convert_link_data_to_m2m(\n        related_data[\"link\"], model=model, m2m_model_map=m2m_model_to_field_map\n    )\n    return {\n        **{name: record_meta[name] for name in [\"id\", \"uid\"]},\n        \"related_data\": related_data,\n    }\n\n\ndef get_collection_with_related(\n    collection: Collection,\n    include_fk: bool = False,\n) -> dict[str, Any]:\n    \"\"\"Fetch a collection with its related data.\"\"\"\n    from .can_curate import get_name_field\n\n    model = collection.__class__\n    schema_modules = get_schema_modules(collection._state.db)\n\n    foreign_key_fields = [\n        f.name\n        for f in model._meta.fields\n        if f.is_relation and f.related_model.__get_module_name__() in schema_modules\n    ]\n\n    # Clear previous queries\n    connection.queries_log.clear()\n\n    annotations = {}\n\n    if include_fk:\n        for fk in foreign_key_fields:\n            name_field = get_name_field(get_related_model(model, fk))\n            if fk == \"run\":\n                annotations[f\"fkfield_{fk}\"] = JSONObject(\n                    id=F(f\"{fk}__id\"),\n                    name=F(f\"{fk}__{name_field}\"),\n                    transform_key=F(f\"{fk}__transform__key\"),\n                )\n            else:\n                annotations[f\"fkfield_{fk}\"] = JSONObject(\n                    id=F(f\"{fk}__id\"), name=F(f\"{fk}__{name_field}\")\n                )\n\n    collection_meta = (\n        model.objects.using(collection._state.db)\n        .filter(uid=collection.uid)\n        .annotate(**annotations)\n        .values(*[\"id\", \"uid\"], *annotations.keys())\n        .first()\n    )\n\n    if not collection_meta:\n        return None\n\n    related_data: dict = {\"fk\": {}}\n    for k, v in collection_meta.items():\n        if k.startswith(\"fkfield_\") and v is 
not None:\n            related_data[\"fk\"][k[8:]] = v\n\n    return {\n        **{name: collection_meta[name] for name in [\"id\", \"uid\"]},\n        \"related_data\": related_data,\n    }\n\n\ndef get_schema_m2m_relations(artifact: Artifact, slot_schema: dict, limit: int = 20):\n    \"\"\"Fetch all many-to-many relationships for given feature sets.\"\"\"\n    from .can_curate import get_name_field\n\n    m2m_relations = [\n        v\n        for v in dict_related_model_to_related_name(Schema).values()\n        if v is not None and not v.startswith(\"_\") and v != \"artifacts\"\n    ]\n\n    annotations = {}\n    related_names = {}\n    for name in m2m_relations:\n        related_model = get_related_model(Schema, name)\n        if related_model is Schema:\n            # this is for the `type` field\n            continue\n        name_field = get_name_field(related_model)\n\n        # Get the correct field names for the through table\n        if not hasattr(getattr(Schema, name), \"through\"):\n            continue\n        through_model = getattr(Schema, name).through\n\n        # Subquery to get limited related records\n        limited_related = Subquery(\n            through_model.objects.filter(schema=OuterRef(\"pk\")).values(\n                related_model.__name__.lower()\n            )[:limit]\n        )\n\n        annotations[f\"m2mfield_{name}\"] = ArrayAgg(\n            JSONObject(id=F(f\"{name}__id\"), name=F(f\"{name}__{name_field}\")),\n            filter=Q(\n                **{\n                    f\"{name}__id__in\": limited_related,\n                }\n            ),\n            distinct=True,\n        )\n        related_names[name] = related_model.__get_name_with_module__()\n\n    schema_m2m = (\n        Schema.connect(artifact._state.db)\n        .filter(id__in=slot_schema.keys())\n        .annotate(**annotations)\n        .values(\"id\", *annotations.keys())\n    )\n\n    result = {}\n    for fs in schema_m2m:\n        slot = 
slot_schema.get(fs[\"id\"])\n        result[fs[\"id\"]] = (\n            slot,\n            {\n                related_names.get(k[9:]): [item[\"name\"] for item in v]\n                for k, v in fs.items()\n                if k.startswith(\"m2mfield_\") and v\n            },\n        )\n\n    return result\n\n\npatch_many_to_many_descriptor()\n"
  },
  {
    "path": "lamindb/models/_feature_manager.py",
    "content": "# ruff: noqa: TC004\nfrom __future__ import annotations\n\nfrom collections import defaultdict\nfrom collections.abc import Iterable\nfrom datetime import date, datetime\nfrom itertools import compress\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any\n\nimport numpy as np\nfrom django.contrib.postgres.aggregates import ArrayAgg\nfrom django.db import connections\nfrom django.db.models import Aggregate, Subquery\nfrom django.db.models.expressions import RawSQL\nfrom django.db.utils import IntegrityError\nfrom lamin_utils import logger\nfrom lamindb_setup.core.upath import UPath\nfrom lamindb_setup.errors import ModuleWasntConfigured\nfrom rich.table import Column, Table\nfrom rich.text import Text\nfrom rich.tree import Tree\n\nfrom lamindb.errors import DoesNotExist, InvalidArgument, ValidationError\nfrom lamindb.models._from_values import _format_values\nfrom lamindb.models.feature import (\n    serialize_pandas_dtype,\n    suggest_categorical_for_str_iterable,\n)\nfrom lamindb.models.has_parents import keep_topmost_matches\nfrom lamindb.models.save import save\nfrom lamindb.models.schema import DICT_KEYS_TYPE, Schema\nfrom lamindb.models.sqlrecord import (\n    REGISTRY_UNIQUE_FIELD,\n    get_name_field,\n    transfer_fk_to_default_db_bulk,\n    transfer_to_default_db,\n)\n\nfrom ._describe import (\n    NAME_WIDTH,\n    TYPE_WIDTH,\n    VALUES_WIDTH,\n    describe_header,\n    format_rich_tree,\n)\nfrom ._django import get_artifact_or_run_with_related\nfrom ._label_manager import _get_labels\nfrom ._relations import (\n    dict_related_model_to_related_name,\n)\nfrom .feature import Feature, FeaturePredicate, JsonValue, parse_dtype\nfrom .sqlrecord import SQLRecord\nfrom .ulabel import ULabel\n\nif TYPE_CHECKING:\n    from rich.tree import Tree\n\n    from lamindb.base.types import FieldAttr\n    from lamindb.models import (\n        Artifact,\n        Collection,\n        IsLink,\n    )\n    from lamindb.models.query_set import 
BasicQuerySet, SQLRecordList\n\n    from ..base.types import DtypeObject\n    from .record import Record\n    from .run import Run\n\n\ndef get_accessor_by_registry_(host: Artifact | Collection) -> dict:\n    dictionary = {\n        field.related_model.__get_name_with_module__(): field.name\n        for field in host._meta.related_objects\n    }\n    dictionary[\"Feature\"] = \"features\"\n    dictionary[\"ULabel\"] = \"ulabels\"\n    dictionary[\"Record\"] = \"records\"\n    return dictionary\n\n\ndef get_schema_by_slot_(host: Artifact) -> dict[str, Schema]:\n    # if the host is not yet saved\n    if host._state.adding:\n        if hasattr(host, \"_staged_schemas\"):\n            return host._staged_schemas\n        else:\n            return {}\n    host_db = host._state.db\n    kwargs = {\"artifact_id\": host.id}\n    # otherwise, we need a query\n    links_schema = (\n        host.schemas.through.objects.using(host_db)\n        .filter(**kwargs)\n        .select_related(\"schema\")\n    )\n    return {fsl.slot: fsl.schema for fsl in links_schema}\n\n\ndef get_label_links(\n    host: Artifact | Collection, registry: str, feature: Feature\n) -> BasicQuerySet:\n    kwargs = {\"artifact_id\": host.id, \"feature_id\": feature.id}\n    link_records = (\n        getattr(host, host.features._accessor_by_registry[registry])  # type: ignore\n        .through.objects.using(host._state.db)\n        .filter(**kwargs)\n    )\n    return link_records\n\n\ndef get_schema_links(host: Artifact | Collection) -> BasicQuerySet:\n    kwargs = {\"artifact_id\": host.id}\n    links_schema = host.schemas.through.objects.filter(**kwargs)\n    return links_schema\n\n\ndef get_link_attr(\n    link: IsLink | type[IsLink],\n    data: Artifact | Collection | Run | type,\n) -> str:\n    link_model_name = link.__class__.__name__\n    if link_model_name in {\"Registry\", \"ModelBase\"}:  # we passed the type of the link\n        link_model_name = link.__name__  # type: ignore\n    if 
link_model_name.startswith(\"Record\") or link_model_name == \"ArtifactArtifact\":\n        return \"value\"\n    host_name = data.__name__ if isinstance(data, type) else data.__class__.__name__\n    return link_model_name.replace(host_name, \"\").lower()\n\n\ndef get_categorical_link_info(\n    host_class: type[SQLRecord],\n    label_registry: type[SQLRecord],\n    instance: str | None = None,\n) -> tuple[type[SQLRecord], str, str]:\n    \"\"\"Resolve (link_model, value_field_name, filter_accessor_name) for (host_class, label_registry).\n\n    Used by filter_base (categorical path) and _add_label_feature_links.\n    \"\"\"\n    host_name = host_class.__name__.lower()\n\n    if host_name == \"record\":\n        d = dict_related_model_to_related_name(\n            host_class, links=True, instance=instance\n        )\n        for rel in host_class._meta.related_objects:\n            link_model = rel.related_model\n            key = link_model.__get_name_with_module__()\n            if key not in d:\n                continue\n            if not hasattr(link_model, \"feature_id\") or not hasattr(\n                link_model, \"value\"\n            ):\n                continue\n            value_fk = link_model._meta.get_field(\"value\")\n            if (\n                value_fk.remote_field is None\n                or value_fk.remote_field.model != label_registry\n            ):\n                continue\n            accessor = d[key]\n            return (link_model, \"value\", accessor)\n        raise ValueError(\n            f\"No categorical link model for Record + {label_registry.__name__}. \"\n            \"Ensure the label registry has a Record* link model (e.g. RecordRecord, RecordULabel) \"\n            \"or a bionty link model (e.g. 
RecordCellLine) in loaded schema modules.\"\n        )\n\n    # Artifact, Run, or Collection\n    attr_map = {\n        \"artifact\": \"artifacts\",\n        \"run\": \"runs\",\n        \"collection\": \"collections\",\n    }\n    attr = attr_map.get(host_name)\n    if not attr or not hasattr(label_registry, attr):\n        raise ValueError(\n            f\"{label_registry.__name__} has no {attr or host_name!r} relation; \"\n            \"cannot resolve categorical link for this host.\"\n        )\n    through = getattr(label_registry, attr).through\n    link_model = through\n    host_fk = host_name  # \"artifact\", \"run\", \"collection\"\n    value_field = get_link_attr(link_model, host_class)\n    filter_accessor = getattr(link_model, host_fk).field._related_name\n    return (link_model, value_field, filter_accessor)\n\n\ndef strip_cat(feature_dtype: str) -> str:\n    if \"cat[\" in feature_dtype:\n        parts = feature_dtype.split(\"cat[\")\n        dtype_stripped_cat = \"\".join(\n            part[:-1] if i != 0 else part for i, part in enumerate(parts)\n        )\n    else:\n        dtype_stripped_cat = feature_dtype\n    return dtype_stripped_cat\n\n\ndef format_dtype_for_display(dtype_str: str) -> str:\n    \"\"\"Format dtype string for display, replacing Record[uid] or ULabel[uid] with Record[TypeName] or ULabel[TypeName].\"\"\"\n    from .feature import parse_dtype\n    from .record import Record\n    from .ulabel import ULabel\n\n    # Check if this is a Record[uid] or ULabel[uid] format\n    if (\"Record[\" in dtype_str or \"ULabel[\" in dtype_str) and \"]\" in dtype_str:\n        try:\n            parsed = parse_dtype(dtype_str)\n            if parsed and parsed[0].get(\"record_uid\"):\n                record_uid = parsed[0][\"record_uid\"]\n                registry_str = parsed[0].get(\"registry_str\", \"\")\n                try:\n                    # Determine which registry to use\n                    if registry_str == \"Record\":\n              
          record_type = Record.get(uid=record_uid)\n                        # Replace Record[uid] with Record[TypeName]\n                        dtype_str = dtype_str.replace(\n                            f\"Record[{record_uid}]\", f\"Record[{record_type.name}]\"\n                        )\n                    elif registry_str == \"ULabel\":\n                        record_type = ULabel.get(uid=record_uid)\n                        # Replace ULabel[uid] with ULabel[TypeName]\n                        dtype_str = dtype_str.replace(\n                            f\"ULabel[{record_uid}]\", f\"ULabel[{record_type.name}]\"\n                        )\n                except Exception as e:\n                    # If we can't find the record, just return the original\n                    logger.debug(\n                        f\"Could not find {registry_str} with uid '{record_uid}' for display formatting: {e}\"\n                    )\n        except Exception as e:\n            # If parsing fails, return the original\n            logger.debug(\n                f\"Could not parse dtype string '{dtype_str}' for display formatting: {e}\"\n            )\n    return dtype_str\n\n\n# Custom aggregation for SQLite\nclass GroupConcat(Aggregate):\n    function = \"GROUP_CONCAT\"\n    template = '%(function)s(%(expressions)s, \", \")'\n\n\ndef custom_aggregate(field, using: str):\n    if connections[using].vendor == \"postgresql\":\n        return ArrayAgg(field)\n    else:\n        return GroupConcat(field)\n\n\ndef get_categoricals_postgres(\n    self: Artifact | Collection | Run,\n    related_data: dict | None = None,\n) -> dict[tuple[str, str], set[str]]:\n    \"\"\"Get categorical features and their values using PostgreSQL-specific optimizations.\"\"\"\n    if related_data is None:\n        if self.__class__.__name__ in {\"Artifact\", \"Run\", \"Record\"}:\n            artifact_meta = get_artifact_or_run_with_related(\n                self, include_feature_link=True, 
include_m2m=True\n            )\n            related_data = artifact_meta.get(\"related_data\", {})\n        else:\n            related_data = {}\n\n    # Process m2m data\n    m2m_data = related_data.get(\"m2m\", {}) if related_data else {}\n    # e.g. m2m_data = {'tissues': {1: {'id': 1, 'uid': '1fIFAQJY', 'abbr': None, 'name': 'brain', 'tissue': 1, 'feature': 1, 'ontology_id': 'UBERON:0000955', 'tissue_display': 'brain'}, 10: {'id': 2, 'uid': '7Tt4iEKc', 'abbr': None, 'name': 'lung', 'tissue': 10, 'feature': 1, 'ontology_id': 'UBERON:0002048', 'tissue_display': 'lung'}}, 'cell_types': {1: {'id': 1, 'uid': '3QnZfoBk', 'abbr': None, 'name': 'neuron', 'feature': 2, 'celltype': 1, 'ontology_id': 'CL:0000540', 'celltype_display': 'neuron'}}}\n    # e.g. {'tissue': {1: {'id': 1, 'uid': '1fIFAQJY', 'abbr': None, 'name': 'brain', 'tissue': 1, 'feature': 1, 'ontology_id': 'UBERON:0000955', 'tissue_display': 'brain'}, 10: {'id': 2, 'uid': '7Tt4iEKc', 'abbr': None, 'name': 'lung', 'tissue': 10, 'feature': 1, 'ontology_id': 'UBERON:0002048', 'tissue_display': 'lung'}}, 'celltype': {1: {'id': 1, 'uid': '3QnZfoBk', 'abbr': None, 'name': 'neuron', 'feature': 2, 'celltype': 1, 'ontology_id': 'CL:0000540', 'celltype_display': 'neuron'}}}\n    # integers are the ids of the related labels\n    m2m_name = {}\n    if not self.__class__.__name__ == \"Record\":\n        for related_name, values in m2m_data.items():\n            link_model = getattr(self.__class__, related_name).through\n            related_model_name = link_model.__name__.replace(\n                self.__class__.__name__, \"\", 1\n            ).lower()\n            if related_model_name == \"artifact\":\n                related_model_name = \"value\"\n            m2m_name[related_model_name] = values\n    else:\n        m2m_name = related_data.get(\"m2m\", {})\n\n    # Get feature information\n    links_data = related_data.get(\"link\", {}) if related_data else {}\n    # e.g. 
feature_dict = {1: ('tissue', 'cat[bionty.Tissue.ontology_id]'), 2: ('cell_type', 'cat[bionty.CellType]')}\n    feature_dict = {\n        id: (name, dtype)\n        for id, name, dtype in Feature.connect(self._state.db).values_list(\n            \"id\", \"name\", \"_dtype_str\"\n        )\n    }\n\n    # Build result dictionary\n    result = {}  # type: ignore\n    for link_name, link_values in links_data.items():\n        related_name = link_name.removeprefix(\"links_\").replace(\"_\", \"\")\n        if not link_values:\n            continue\n        # sort by the order on the link table, important for list dtypes\n        for link_value in sorted(link_values, key=lambda x: x.get(\"id\")):\n            feature_id = link_value.get(\"feature\")\n            if feature_id is None:\n                continue\n            feature_name, feature_dtype = feature_dict.get(feature_id)\n            feature_field = parse_dtype(feature_dtype)[0][\"field_str\"]\n            if not self.__class__.__name__ == \"Record\":\n                label_id = link_value.get(related_name)\n                label_name = (\n                    m2m_name.get(related_name, {}).get(label_id, {}).get(feature_field)\n                )\n            else:\n                label_name = link_value.get(feature_field)\n            if label_name:\n                dict_key = (feature_name, feature_dtype)\n                if dict_key not in result:\n                    result[dict_key] = (\n                        set() if not feature_dtype.startswith(\"list[cat\") else []\n                    )\n                if feature_dtype.startswith(\"list[cat\"):\n                    result[dict_key].append(label_name)\n                else:\n                    result[dict_key].add(label_name)\n    return dict(result)\n\n\ndef get_categoricals_sqlite(\n    self: Artifact | Collection,\n) -> dict[tuple[str, str], set[str]]:\n    \"\"\"Get categorical features and their values using the default approach.\"\"\"\n    from 
.query_set import get_default_branch_ids\n\n    result = {}  # type: ignore\n    for _, links in _get_labels(self, links=True, instance=self._state.db).items():\n        for link in links:\n            if link.__class__.__name__ == \"RecordJson\":\n                continue\n            if hasattr(link, \"feature_id\") and link.feature_id is not None:\n                feature = Feature.objects.using(self._state.db).get(id=link.feature_id)\n                dtype_str = feature._dtype_str\n                feature_field = parse_dtype(dtype_str)[0][\"field_str\"]\n                link_attr = get_link_attr(link, self)\n                label = getattr(link, link_attr)\n                if hasattr(label, \"branch_id\"):\n                    if label.branch_id not in get_default_branch_ids():\n                        continue\n                label_name = getattr(label, feature_field)\n                dict_key = (feature.name, dtype_str)\n                if dict_key not in result:\n                    result[dict_key] = (\n                        set() if not dtype_str.startswith(\"list[cat\") else []\n                    )\n                if dtype_str.startswith(\"list[cat\"):\n                    result[dict_key].append(label_name)\n                else:\n                    result[dict_key].add(label_name)\n    return dict(result)\n\n\ndef get_non_categoricals(\n    self,\n) -> dict[tuple[str, str], set[Any]]:\n    \"\"\"Get non-categorical features and their values.\"\"\"\n    import pandas as pd\n\n    from .artifact import Artifact\n    from .record import Record\n    from .run import Run\n\n    non_categoricals = {}\n\n    if self.id is not None and isinstance(self, (Artifact, Run, Record)):\n        if isinstance(self, Record):\n            json_values = self.values_json.values(\n                \"feature__name\", \"feature___dtype_str\", \"value\"\n            ).order_by(\"feature__name\")\n        else:\n            json_values = (\n                
self.json_values.values(\"feature__name\", \"feature___dtype_str\")\n                .annotate(values=custom_aggregate(\"value\", self._state.db))\n                .order_by(\"feature__name\")\n            )\n\n        for fv in json_values:\n            feature_name = fv[\"feature__name\"]\n            feature_dtype = fv[\"feature___dtype_str\"]\n            if isinstance(self, Record):\n                values = fv[\"value\"]\n            else:\n                values = fv[\"values\"]\n\n            if connections[self._state.db].vendor == \"sqlite\":\n                # undo GROUP_CONCAT\n                if isinstance(values, str):\n                    values = {value.strip('\"') for value in values.split(\", \")}\n\n            # Convert single values to sets\n            if not isinstance(values, (list, dict, set)):\n                values = {values}\n            elif (\n                isinstance(values, list)\n                and feature_dtype != \"dict\"\n                and not feature_dtype.startswith(\"list\")\n            ):\n                try:\n                    values = set(values)\n                except TypeError:\n                    # TypeError: unhashable type: 'list' if values is list[list]\n                    pass\n\n            # Handle special datetime types\n            if feature_dtype == \"datetime\":\n                values = {datetime.fromisoformat(value) for value in values}\n            if feature_dtype == \"date\":\n                # date.fromisoformat() cannot handle cases like 2025-01-17T00:00:00.000Z\n                values = {\n                    pd.to_datetime(value, format=\"ISO8601\").date() for value in values\n                }\n            if connections[self._state.db].vendor == \"sqlite\":\n                # undo GROUP_CONCAT\n                if feature_dtype == \"int\":\n                    values = {int(value) for value in values}\n                if feature_dtype == \"float\":\n                    values = 
{float(value) for value in values}\n                if feature_dtype == \"num\":\n                    values = {float(value) for value in values}\n\n            non_categoricals[(feature_name, feature_dtype)] = values\n\n    return non_categoricals\n\n\ndef create_feature_table(\n    name: str, registry_str: str, data: list, show_header: bool = False\n) -> Table:\n    \"\"\"Create a Rich table for a feature group.\"\"\"\n    table = Table(\n        Column(name, style=\"\", no_wrap=True, width=NAME_WIDTH),\n        Column(registry_str, style=\"dim\", no_wrap=True, width=TYPE_WIDTH),\n        Column(\"\", width=VALUES_WIDTH, no_wrap=True),\n        show_header=show_header,\n        box=None,\n        pad_edge=False,\n    )\n    for row in data:\n        table.add_row(*row)\n    return table\n\n\ndef get_features_data(\n    self: Artifact | Run | Record,\n    related_data: dict | None = None,\n    to_dict: bool = False,\n    external_only: bool = False,\n):\n    from .artifact import Artifact\n\n    dictionary: dict[str, Any] = {}\n\n    if self._state.adding:\n        if to_dict:\n            return dictionary\n        else:\n            raise NotImplementedError\n\n    # feature sets\n    schema_data: dict[str, tuple[str, list[str]]] = {}\n    feature_data: dict[str, tuple[str, list[str]]] = {}\n    if not to_dict and isinstance(self, Artifact):\n        if self.id is not None and connections[self._state.db].vendor == \"postgresql\":\n            if not related_data:\n                artifact_meta = get_artifact_or_run_with_related(\n                    self,\n                    include_schema=True,\n                    include_m2m=True,\n                    include_feature_link=True,\n                )\n                related_data = artifact_meta.get(\"related_data\", {})\n            fs_data = related_data.get(\"m2m_schemas\", {}) if related_data else {}\n            for fs_id, (slot, data) in fs_data.items():\n                for registry_str, feature_names in 
data.items():\n                    # prevent projects show up as features\n                    if registry_str == \"Project\":\n                        continue\n                    schema = Schema.objects.using(self._state.db).get(id=fs_id)\n                    schema_data[slot] = (schema, feature_names)\n                    for feature_name in feature_names:\n                        feature_data[feature_name] = (slot, registry_str)\n            schema_data.update(\n                {\n                    slot: (schema, schema.n_members)  # type: ignore\n                    for slot, schema in get_schema_by_slot_(self).items()\n                    if slot not in schema_data\n                }\n            )\n        else:\n            for slot, schema in get_schema_by_slot_(self).items():\n                features = schema.members\n                if features.exists():\n                    # features.first() is a lot slower than features[0] here\n                    name_field = get_name_field(features[0])\n                    feature_names = list(\n                        features.values_list(name_field, flat=True)[:20]\n                    )\n                    schema_data[slot] = (schema, feature_names)\n                    for feature_name in feature_names:\n                        feature_data[feature_name] = (slot, schema.itype)\n                else:\n                    schema_data[slot] = (schema, schema.n_members)\n\n    internal_feature_names = {}\n    if isinstance(self, Artifact):\n        inferred_schemas = self.schemas.filter(itype=\"Feature\")\n        if len(inferred_schemas) > 0:\n            for schema in inferred_schemas:\n                # Use _dtype_str instead of dtype, and format for display\n                feature_dtypes = dict(schema.members.values_list(\"name\", \"_dtype_str\"))\n                # Format Record[uid] to Record[TypeName] for display\n                formatted_dtypes = {\n                    name: 
format_dtype_for_display(dtype_str) if dtype_str else \"\"\n                    for name, dtype_str in feature_dtypes.items()\n                }\n                internal_feature_names.update(formatted_dtypes)\n\n    # categorical feature values\n    # Get the categorical data using the appropriate method\n    # e.g. categoricals = {('tissue', 'cat[bionty.Tissue.ontology_id]'): {'brain'}, ('cell_type', 'cat[bionty.CellType]'): {'neuron'}}\n    if not self._state.adding and connections[self._state.db].vendor == \"postgresql\":\n        categoricals = get_categoricals_postgres(\n            self,\n            related_data=related_data,\n        )\n    else:\n        categoricals = get_categoricals_sqlite(\n            self,\n        )\n\n    # Get non-categorical features\n    non_categoricals = get_non_categoricals(\n        self,\n    )\n\n    internal_feature_labels = {}\n    external_data = []\n    for features, is_categoricals in [(categoricals, True), (non_categoricals, False)]:\n        for (feature_name, feature_dtype), values in sorted(features.items()):\n            # Handle dictionary conversion\n            if feature_dtype.startswith(\"list[cat\"):\n                converted_values = values  # is already a list\n            else:\n                converted_values = values if len(values) > 1 else next(iter(values))\n            if to_dict:\n                dictionary[feature_name] = converted_values\n                continue\n\n            # Format message\n            if is_categoricals and isinstance(converted_values, set):\n                printed_values = _format_values(\n                    sorted(converted_values), n=10, quotes=False\n                )\n            elif (\n                not is_categoricals\n                and not feature_dtype.startswith((\"list\", \"dict\"))\n                and isinstance(converted_values, set)\n            ):\n                printed_values = _format_values(\n                    sorted(converted_values), n=10, 
quotes=False\n                )\n            else:\n                printed_values = str(converted_values)\n\n            # Format dtype for display (replace Record[uid] with Record[TypeName])\n            display_dtype = format_dtype_for_display(feature_dtype)\n\n            # Sort into internal/external\n            feature_info = (\n                feature_name,\n                Text(strip_cat(display_dtype), style=\"dim\"),\n                printed_values,\n            )\n            if feature_name in internal_feature_names:\n                internal_feature_labels[feature_name] = feature_info\n            else:\n                external_data.append(feature_info)\n\n    if to_dict:\n        if external_only:\n            return {\n                k: v for k, v in dictionary.items() if k not in internal_feature_names\n            }\n        else:\n            return dictionary\n    else:\n        return (\n            internal_feature_labels,\n            feature_data,\n            schema_data,\n            internal_feature_names,\n            external_data,\n        )\n\n\ndef describe_features(\n    self: Artifact | Run | Record,\n    related_data: dict | None = None,\n) -> tuple[Tree | None, Tree | None]:\n    \"\"\"Describe features of an artifact or collection.\"\"\"\n    if self._state.adding:\n        return None, None\n    (\n        internal_feature_labels,\n        feature_data,\n        schema_data,\n        internal_feature_names,\n        external_data,\n    ) = get_features_data(\n        self,\n        related_data=related_data,\n    )\n\n    # Dataset features section\n    # internal features that contain labels (only `Feature` features contain labels)\n    internal_feature_labels_slot: dict[str, list] = {}\n    for feature_name, feature_row in internal_feature_labels.items():\n        slot, _ = feature_data.get(feature_name)\n        internal_feature_labels_slot.setdefault(slot, []).append(feature_row)\n\n    dataset_features_tree_children = 
[]\n    for slot, (schema, feature_names_or_n) in schema_data.items():\n        if feature_names_or_n is None or isinstance(feature_names_or_n, int):\n            feature_rows = []\n        else:\n            feature_names = feature_names_or_n\n            if slot in internal_feature_labels_slot:\n                # add internal Feature features with labels\n                feature_rows = internal_feature_labels_slot[slot]\n                # add internal Feature features without labels\n                feature_rows += [\n                    (\n                        feature_name,\n                        Text(\n                            strip_cat(internal_feature_names.get(feature_name)),\n                            style=\"dim\",\n                        ),\n                        \"\",\n                    )\n                    for feature_name in feature_names\n                    if feature_name and feature_name not in internal_feature_labels\n                ]\n            else:\n                # add internal non-Feature features without labels\n                feature_rows = [\n                    (\n                        feature_name,\n                        Text(\n                            strip_cat(\n                                internal_feature_names.get(feature_name)\n                                if feature_name in internal_feature_names\n                                else schema.dtype\n                            ),\n                            style=\"dim\",\n                        ),\n                        \"\",\n                    )\n                    for feature_name in feature_names\n                    if feature_name\n                ]\n            feature_rows.sort(key=lambda x: x[0])\n        schema_itype = f\" {schema.itype}\" if schema.itype != \"Feature\" else \"\"\n        dataset_features_tree_children.append(\n            create_feature_table(\n                Text.assemble(\n                    (slot, 
\"violet\"),\n                    (f\" ({schema.n_members}{schema_itype})\", \"dim\"),\n                ),\n                \"\",\n                feature_rows,\n                show_header=True,\n            )\n        )\n    # external features\n    external_features_tree_children = []\n    if external_data:\n        external_features_tree_children.append(\n            create_feature_table(\n                \"\",\n                \"\",\n                external_data,\n            )\n        )\n\n    # trees\n    dataset_features_tree = None\n    if dataset_features_tree_children:\n        dataset_features_tree = Tree(\n            Text(\"Dataset features\", style=\"bold bright_magenta\")\n        )\n        for child in dataset_features_tree_children:\n            dataset_features_tree.add(child)\n    external_features_tree = None\n    if external_features_tree_children:\n        external_features_text = (\n            \"External features\"\n            if (\n                self.__class__.__name__ == \"Artifact\" and dataset_features_tree_children\n            )\n            else \"Features\"\n        )\n        external_features_tree = Tree(\n            Text(external_features_text, style=\"bold dark_orange\")\n        )\n        for child in external_features_tree_children:\n            external_features_tree.add(child)\n    return dataset_features_tree, external_features_tree\n\n\ndef infer_convert_dtype_key_value(\n    key: str, value: Any, mute: bool = False, dtype_str: str | None = None\n) -> tuple[str, Any, str]:\n    import pandas as pd\n\n    from lamindb.base.dtypes import is_valid_datetime_str\n\n    message = \"\"\n    if isinstance(value, bool):\n        return \"bool\", value, message\n    elif isinstance(value, int):\n        return \"int\", value, message\n    elif isinstance(value, float):\n        return \"float\", value, message\n    elif isinstance(value, datetime):\n        return \"datetime\", value.isoformat(), message\n    elif 
isinstance(value, date):\n        return \"date\", value.isoformat(), message\n    elif isinstance(value, str):\n        if dtype_str in {None, \"datetime\", \"date\"} and (\n            datetime_str := is_valid_datetime_str(value)\n        ):\n            dt_type = (\n                \"date\" if len(value) == 10 else \"datetime\"\n            )  # YYYY-MM-DD is exactly 10 characters\n            sanitized_value = datetime_str[:10] if dt_type == \"date\" else datetime_str  # type: ignore\n            return dt_type, sanitized_value, message  # type: ignore\n        else:\n            return \"cat ? str\", value, message\n    elif isinstance(value, SQLRecord):\n        # SQLRecord is not converted to JSON\n        return (f\"cat[{value.__class__.__get_name_with_module__()}]\", value, message)\n    elif isinstance(value, (Path, UPath)):\n        return \"path\", value.as_posix().rstrip(\"/\"), message\n    elif isinstance(value, Iterable) and not isinstance(value, (str, bytes)):\n        if isinstance(value, (pd.Series, np.ndarray, pd.Categorical)):\n            dtype = serialize_pandas_dtype(value.dtype)\n            if dtype == \"str\":\n                # ndarray doesn't know categorical, so there was no conscious choice\n                # offer both options\n                if isinstance(value, np.ndarray):\n                    dtype = \"cat ? 
str\"\n                else:\n                    # suggest to create a categorical if there are few unique values\n                    message = suggest_categorical_for_str_iterable(value, key)\n                    if message:\n                        message = f\"  # {message}\"\n            return dtype, list(value), message\n        if isinstance(value, dict):\n            return \"dict\", value, message\n        if len(value) > 0:  # type: ignore\n            first_element = next(iter(value))\n            first_element_type = type(first_element)\n            # check that all elements are of the same type\n            if all(isinstance(elem, first_element_type) for elem in value):\n                if first_element_type is bool:\n                    return \"list[bool]\", value, message\n                elif first_element_type is int:\n                    return \"list[int]\", value, message\n                elif first_element_type is float:\n                    return \"list[float]\", value, message\n                elif first_element_type is str:\n                    return (\"list[cat ? 
str]\", value, message)\n                elif isinstance(first_element, SQLRecord):\n                    return (\n                        f\"list[cat[{first_element_type.__get_name_with_module__()}]]\",\n                        value,\n                        message,\n                    )\n    if not mute:\n        logger.warning(f\"cannot infer feature type of: {value}, returning '?'\")\n    return \"?\", value, message\n\n\ndef _filter_one_feature_clause(\n    queryset: BasicQuerySet,\n    feature: Feature,\n    comparator: str,\n    value: Any,\n) -> BasicQuerySet:\n    from lamindb.models import Artifact\n    from lamindb.models.record import Record, RecordJson\n    from lamindb.models.run import Run\n\n    dtype_str = feature._dtype_str\n    # non-categorical features\n    if not dtype_str.startswith(\"cat\") and not dtype_str.startswith(\"list[cat\"):\n        if comparator == \"__isnull\":\n            if queryset.model is Artifact:\n                from .artifact import ArtifactJsonValue\n\n                value_subquery = ArtifactJsonValue.objects.filter(\n                    jsonvalue__feature=feature\n                ).values(\"artifact_id\")\n                return queryset.exclude(id__in=Subquery(value_subquery))\n\n        if comparator in {\"__startswith\", \"__contains\"}:\n            logger.important(\n                f\"currently not supporting `{comparator}`, using `__icontains` instead\"\n            )\n            comparator = \"__icontains\"\n        use_numeric_sqlite = (\n            connections[feature._state.db].vendor == \"sqlite\"\n            and comparator in {\"__gt\", \"__lt\", \"__gte\", \"__lte\"}\n            and dtype_str in (\"int\", \"float\", \"num\")\n        )\n        if use_numeric_sqlite:\n            # Numeric comparison via json_extract + CAST (avoids lexicographic comparison)\n            num_val_raw = RawSQL(\"CAST(json_extract(value, '$') AS REAL)\", ())\n            if queryset.model is Record:\n                
value_qs = (\n                    RecordJson.objects.using(queryset.db)\n                    .filter(feature=feature)\n                    .annotate(num_val=num_val_raw)\n                    .filter(**{f\"num_val{comparator}\": value})\n                )\n                return queryset.filter(values_json__id__in=value_qs)\n            else:\n                json_values = (\n                    JsonValue.objects.using(queryset.db)\n                    .filter(feature=feature)\n                    .annotate(num_val=num_val_raw)\n                    .filter(**{f\"num_val{comparator}\": value})\n                )\n                accessor = (\n                    \"json_values\"\n                    if queryset.model in {Artifact, Run}\n                    else \"values_json\"\n                )\n                return queryset.filter(**{f\"{accessor}__id__in\": json_values})\n        else:\n            if connections[feature._state.db].vendor == \"sqlite\" and comparator in {\n                \"__gt\",\n                \"__lt\",\n                \"__gte\",\n                \"__lte\",\n            }:\n                # SQLite: lexicographic comparison for non-numeric dtypes (date, datetime, str)\n                value = str(value)\n            filter_expr = {\"feature\": feature, f\"value{comparator}\": value}\n            if queryset.model is Record:\n                value_qs = RecordJson.objects.using(queryset.db).filter(**filter_expr)\n                return queryset.filter(values_json__id__in=value_qs)\n            else:\n                json_values = JsonValue.objects.using(queryset.db).filter(**filter_expr)\n                accessor = (\n                    \"json_values\"\n                    if queryset.model in {Artifact, Run}\n                    else \"values_json\"\n                )\n                return queryset.filter(**{f\"{accessor}__id__in\": json_values})\n    # categorical features\n    elif isinstance(value, (str, SQLRecord, bool)):\n        
result = parse_dtype(dtype_str)[0]\n        label_registry = result[\"registry\"]\n        _, value_field_name, filter_accessor_name = get_categorical_link_info(\n            queryset.model, label_registry, instance=queryset.db\n        )\n        if comparator == \"__isnull\":\n            kwargs = {f\"{filter_accessor_name}__feature\": feature}\n            if value:  # True\n                return queryset.exclude(**kwargs)\n            else:\n                return queryset.filter(**kwargs)\n        # because SQL is sensitive to whether querying with __in or not\n        # and might return multiple equivalent records for the latter\n        # we distinguish cases in which we have multiple label matches vs. one\n        label = None\n        labels = None\n        if isinstance(value, str):\n            field_name = result[\"field\"].field.name\n            # users might query like so:\n            # ln.Artifact.filter(experiment__contains=\"Experi\")\n            expression = {f\"{field_name}{comparator}\": value}\n            labels = result[\"registry\"].connect(queryset.db).filter(**expression)\n            if len(labels) == 0:\n                raise DoesNotExist(\n                    f\"Did not find a {label_registry.__name__} matching `{field_name}{comparator}={value}`\"\n                )\n            elif len(labels) == 1:\n                label = labels[0]\n        elif isinstance(value, SQLRecord):\n            label = value\n        new_expression = {f\"{filter_accessor_name}__feature\": feature}\n        if label is not None:\n            new_expression[f\"{filter_accessor_name}__{value_field_name}\"] = label\n        else:\n            new_expression[f\"{filter_accessor_name}__{value_field_name}__in\"] = labels\n        return queryset.filter(**new_expression)\n    raise NotImplementedError\n\n\ndef filter_with_feature_predicates(\n    queryset: BasicQuerySet,\n    predicates: list[FeaturePredicate],\n) -> BasicQuerySet:\n    qs = queryset\n    
pk_name = qs.model._meta.pk.name\n    for predicate in predicates:\n        feature = predicate.feature\n        if qs.db is not None and feature._state.db != qs.db:\n            feature = Feature.connect(qs.db).get(uid=feature.uid)\n        if predicate.comparator == \"__ne\":\n            subset = _filter_one_feature_clause(\n                qs, feature=feature, comparator=\"\", value=predicate.value\n            )\n            qs = qs.exclude(**{f\"{pk_name}__in\": Subquery(subset.values(pk_name))})\n        else:\n            qs = _filter_one_feature_clause(\n                qs,\n                feature=feature,\n                comparator=predicate.comparator,\n                value=predicate.value,\n            )\n    return qs\n\n\ndef filter_base(\n    queryset: BasicQuerySet,\n    _skip_validation: bool = True,\n    **expression,\n) -> BasicQuerySet:\n    from lamindb.models import BasicQuerySet, QuerySet\n\n    assert isinstance(queryset, BasicQuerySet) and not isinstance(queryset, QuerySet)  # noqa: S101\n    keys_normalized = [key.split(\"__\")[0] for key in expression]\n    if not _skip_validation:\n        validated = Feature.connect(queryset.db).validate(\n            keys_normalized, field=\"name\", mute=True\n        )\n        if sum(validated) != len(keys_normalized):\n            raise ValidationError(\n                f\"Some keys in the filter expression are not registered as features: {np.array(keys_normalized)[~validated]}\"\n            )\n    features = Feature.connect(queryset.db).filter(name__in=keys_normalized).distinct()\n    qs = queryset\n    for key, value in expression.items():\n        split_key = key.split(\"__\")\n        normalized_key = split_key[0]\n        comparator = \"\"\n        if len(split_key) == 2:\n            comparator = f\"__{split_key[1]}\"\n        feature = features.get(name=normalized_key)\n        qs = _filter_one_feature_clause(\n            qs, feature=feature, comparator=comparator, value=value\n        
)\n    if qs is queryset:\n        raise NotImplementedError\n    return qs\n\n\ndef filter_with_features(\n    queryset: BasicQuerySet, *queries, **expressions\n) -> BasicQuerySet:\n    from lamindb.models import BasicQuerySet, QuerySet\n\n    feature_predicates = [q for q in queries if isinstance(q, FeaturePredicate)]\n    non_feature_queries = [q for q in queries if not isinstance(q, FeaturePredicate)]\n\n    if isinstance(queryset, QuerySet):\n        # need to avoid infinite recursion because\n        # filter_with_features is called in queryset.filter otherwise\n        filter_kwargs = {\"_skip_filter_with_features\": True}\n    else:\n        filter_kwargs = {}\n    registry = queryset.model\n    qs = queryset\n    if expressions:\n        keys_normalized = [key.split(\"__\")[0] for key in expressions]\n        field_or_feature = keys_normalized[0]\n        if field_or_feature in registry.__get_available_fields__():\n            qs = queryset.filter(*non_feature_queries, **expressions, **filter_kwargs)\n        elif all(\n            features_validated := Feature.objects.using(queryset.db).validate(\n                keys_normalized, field=\"name\", mute=True\n            )\n        ):\n            # filter_base requires qs to be BasicQuerySet\n            qs = filter_base(\n                queryset._to_class(BasicQuerySet, copy=True),\n                _skip_validation=True,\n                **expressions,\n            )._to_class(type(queryset), copy=False)\n            qs = qs.filter(*non_feature_queries, **filter_kwargs)\n        else:\n            features = \", \".join(sorted(np.array(keys_normalized)[~features_validated]))\n            message = f\"feature names: {features}\"\n            avail_fields = registry.__get_available_fields__()\n            fields = \", \".join(sorted(avail_fields))\n            raise InvalidArgument(\n                f\"You can query either by available fields: {fields}\\n\"\n                f\"Or fix invalid {message}\"\n   
         )\n    else:\n        # Always route through `.filter()` here (even when empty) so the\n        # standard QuerySet path can inject default branch constraints.\n        qs = queryset.filter(*non_feature_queries, **filter_kwargs)\n    if feature_predicates:\n        qs = filter_with_feature_predicates(\n            qs._to_class(BasicQuerySet, copy=True),\n            feature_predicates,\n        )._to_class(type(qs), copy=False)\n    return qs\n\n\nclass FeatureManager:\n    \"\"\"Feature manager.\"\"\"\n\n    def __init__(self, sqlrecord: Artifact | Run | Record):\n        # host is the sqlrecord that the label manager is attached to\n        # we might rename _host to _sqlrecord in the future\n        self._host = sqlrecord\n        self._slots: dict[str, Schema] | None = None\n        self._accessor_by_registry_ = None\n\n    def __repr__(self) -> str:\n        return self.describe(return_str=True)  # type: ignore\n\n    def describe(self, return_str: bool = False) -> str | None:\n        \"\"\"Pretty print features.\n\n        This is what `artifact.describe()` calls under the hood.\n        \"\"\"\n        dataset_features_tree, external_features_tree = describe_features(self._host)  # type: ignore\n        tree = describe_header(self._host)\n        if dataset_features_tree:\n            tree.add(dataset_features_tree)\n        if external_features_tree:\n            tree.add(external_features_tree)\n        return format_rich_tree(tree, return_str=return_str)\n\n    def get_values(self, external_only: bool = False) -> dict[str, Any]:\n        \"\"\"Get features as a dictionary.\n\n        Includes annotation with internal and external feature values.\n\n        Args:\n            external_only: If `True`, only return external feature annotations.\n        \"\"\"\n        return get_features_data(self._host, to_dict=True, external_only=external_only)  # type: ignore\n\n    def __getitem__(\n        self, feature: str\n    ) -> (\n        DtypeObject\n 
       | BasicQuerySet\n        | SQLRecord\n        | SQLRecordList\n        | dict[str, DtypeObject | BasicQuerySet | SQLRecord | SQLRecordList]\n    ):\n        \"\"\"Get values by feature name.\n\n        Args:\n            feature: Feature name.\n\n        Returns:\n            - For categorical features, return value records.\n            - For non-categorical features, return values.\n\n        Example::\n\n            artifact.features['tissue']\n            #> Tissue(id=1, name='brain', ...)\n        \"\"\"\n        from collections import defaultdict\n\n        import pandas as pd\n\n        from .query_set import SQLRecordList\n\n        host_name = self._host.__class__.__name__\n        host_id = self._host.id\n        host_db = self._host._state.db\n        feature_records = list(Feature.objects.using(host_db).filter(name=feature))\n        if not feature_records:\n            raise ValidationError(f\"Feature with name {feature} not found\")\n\n        # group cat feature_records by their registry\n        registry_to_features = defaultdict(list)\n        for feature_record in feature_records:\n            parsed_dtype = parse_dtype(feature_record._dtype_str)\n            if len(parsed_dtype) > 0:  # categorical features\n                registry = parsed_dtype[0][\"registry\"]\n                registry_name = registry.__get_name_with_module__()\n                registry_to_features[(registry, registry_name)].append(\n                    feature_record.id\n                )\n            else:  # non-categorical features\n                registry_to_features[(JsonValue, \"JsonValue\")].append(feature_record.id)\n\n        value_records = {}\n\n        # query once per registry with all feature_ids\n        for (registry, registry_name), feature_ids in registry_to_features.items():\n            if registry_name == \"JsonValue\":\n                # for non-categorical features\n                filters = {\n                    \"feature_id__in\": 
feature_ids,\n                    f\"links_{host_name.lower()}__{host_name.lower()}_id\": host_id,\n                }\n                dtype_values = (\n                    registry.objects.using(host_db)\n                    .filter(**filters)\n                    .distinct()\n                    .values_list(\"feature___dtype_str\", \"value\")\n                )\n                feature_values_qs = []\n                for dtype, value in dtype_values:\n                    if dtype == \"date\":\n                        value = pd.to_datetime(value, format=\"ISO8601\").date()\n                    elif dtype == \"datetime\":\n                        value = datetime.fromisoformat(value)\n                    feature_values_qs.append(value)\n            else:\n                # determine links name once per registry\n                links_value_name = (\n                    \"links_value\"\n                    if registry_name == host_name\n                    else f\"links_{host_name.lower()}\"\n                )\n\n                filters = {\n                    f\"{links_value_name}__feature_id__in\": feature_ids,\n                    f\"{links_value_name}__{host_name.lower()}_id\": host_id,\n                }\n\n                feature_values_qs = (\n                    registry.objects.using(host_db).filter(**filters).distinct()\n                )\n\n            if len(feature_values_qs) == 1:\n                value_records[registry_name] = feature_values_qs[0]\n            elif len(feature_values_qs) > 1:\n                if feature_record.dtype_as_str.startswith(\"list[\"):\n                    value_records[registry_name] = SQLRecordList(feature_values_qs)\n                else:\n                    value_records[registry_name] = feature_values_qs\n\n        return (\n            next(iter(value_records.values()))\n            if len(value_records) == 1\n            else value_records\n        )\n\n    @property\n    def slots(self) -> dict[str, Schema]:\n    
    \"\"\"Features by schema slot.\n\n        Example::\n\n            artifact.features.slots\n            #> {'var': <Schema: var>, 'obs': <Schema: obs>}\n        \"\"\"\n        if self._slots is None:\n            self._slots = get_schema_by_slot_(self._host)\n        return self._slots\n\n    @property\n    def _accessor_by_registry(self):\n        \"\"\"Accessor by registry.\"\"\"\n        if self._accessor_by_registry_ is None:\n            self._accessor_by_registry_ = get_accessor_by_registry_(self._host)\n        return self._accessor_by_registry_\n\n    def _add_label_feature_links(\n        self,\n        features_labels,\n    ):\n        host_name = self._host.__class__.__name__.lower()\n        host_is_record = host_name == \"record\"\n        instance = getattr(self._host._state, \"db\", None)\n        for class_name, registry_features_labels in features_labels.items():\n            if not host_is_record and class_name == \"Collection\":\n                continue\n            registry_features_labels[0][0]\n            label_registry = registry_features_labels[0][1].__class__\n            link_model, value_field_name, _ = get_categorical_link_info(\n                self._host.__class__, label_registry, instance=instance\n            )\n            field_name = f\"{value_field_name}_id\"\n            host_fk = f\"{host_name}_id\"\n            links = [\n                link_model(\n                    **{\n                        host_fk: self._host.id,\n                        \"feature_id\": ftr.id,\n                        field_name: label.id,\n                    }\n                )\n                for (ftr, label) in registry_features_labels\n            ]\n            try:\n                save(links, ignore_conflicts=False)\n            except Exception:\n                save(links, ignore_conflicts=True)\n\n    def _get_feature_objects(self, dictionary, feature_field):\n        from ..core._functions import get_current_tracked_run\n\n       
 registry = feature_field.field.model\n        keys = list(dictionary.keys())\n        feature_objects = registry.from_values(keys, field=feature_field, mute=True)\n        feature_objects = keep_topmost_matches(feature_objects)\n        if len(feature_objects) != len(keys):\n            not_validated_keys = [\n                key for key in keys if key not in feature_objects.to_list(\"name\")\n            ]\n            not_validated_keys_dtype_message = [\n                (key, infer_convert_dtype_key_value(key, dictionary[key]))\n                for key in not_validated_keys\n            ]\n            run = get_current_tracked_run()\n            if run is not None:\n                name = f\"{run.transform.kind}[{run.transform.key}]\"\n                type_hint = f\"\"\"  feature_type = ln.Feature(name='{name}', is_type=True).save()\"\"\"\n                elements = [type_hint]\n                type_kwarg = \", type=feature_type\"\n            else:\n                elements = []\n                type_kwarg = \"\"\n            elements += [\n                f\"  ln.Feature(name='{key}', dtype='{dtype}'{type_kwarg}).save(){message}\"\n                for key, (dtype, _, message) in not_validated_keys_dtype_message\n            ]\n            hint = \"\\n\".join(elements)\n            msg = (\n                f\"These keys could not be validated: {not_validated_keys}\\n\"\n                f\"Here is how to create a feature:\\n\\n{hint}\"\n            )\n            raise ValidationError(msg)\n        return feature_objects\n\n    def _resolve_feature_value_dictionary(\n        self,\n        values: dict[str | Feature, Any],\n    ) -> tuple[dict[str, Any], dict[str, Any], list[Feature], dict[str, Any]]:\n        \"\"\"Normalize a feature-value dictionary to support `str` and `Feature` keys.\n\n        Returns:\n            normalized_values: Values keyed by feature name (used by schema validators).\n            string_key_values: Subset of values that came from 
string keys only.\n            explicit_features: Resolved Feature objects passed explicitly as keys.\n            values_by_feature_uid: Values keyed by feature uid (used for exact lookup).\n        \"\"\"\n        host_db = self._host._state.db\n        normalized_values: dict[str, Any] = {}\n        string_key_values: dict[str, Any] = {}\n        explicit_features: list[Feature] = []\n        values_by_feature_uid: dict[str, Any] = {}\n        seen_explicit_uids: set[str] = set()\n\n        for key, value in values.items():\n            if isinstance(key, Feature):\n                if key._state.adding:\n                    raise ValidationError(\n                        f\"Please save feature '{key.name}' before annotation.\"\n                    )\n                feature = key\n                # Mirror feature predicate resolution: resolve Feature objects on active DB.\n                if host_db is not None and feature._state.db != host_db:\n                    feature = Feature.connect(host_db).get(uid=feature.uid)\n                if feature.uid in values_by_feature_uid and (\n                    values_by_feature_uid[feature.uid] != value\n                ):\n                    raise ValidationError(\n                        f\"Conflicting values for feature '{feature.name}'.\"\n                    )\n                values_by_feature_uid[feature.uid] = value\n                if feature.uid not in seen_explicit_uids:\n                    explicit_features.append(feature)\n                    seen_explicit_uids.add(feature.uid)\n                if (\n                    feature.name in normalized_values\n                    and normalized_values[feature.name] != value\n                ):\n                    raise ValidationError(\n                        f\"Conflicting values for feature name '{feature.name}'.\"\n                    )\n                normalized_values[feature.name] = value\n            elif isinstance(key, str):\n                if key 
in normalized_values and normalized_values[key] != value:\n                    raise ValidationError(\n                        f\"Conflicting values for feature name '{key}'.\"\n                    )\n                normalized_values[key] = value\n                string_key_values[key] = value\n            else:\n                raise TypeError(\n                    \"Feature-value dictionary keys must be `str` or `Feature`, \"\n                    f\"got {type(key)}\"\n                )\n\n        return (\n            normalized_values,\n            string_key_values,\n            explicit_features,\n            values_by_feature_uid,\n        )\n\n    @staticmethod\n    def _merge_feature_objects(\n        explicit_features: list[Feature],\n        looked_up_features,\n    ) -> list[Feature]:\n        merged: list[Feature] = []\n        seen_uids: set[str] = set()\n        for feature in explicit_features:\n            if feature.uid not in seen_uids:\n                merged.append(feature)\n                seen_uids.add(feature.uid)\n        for feature in looked_up_features:\n            if feature.uid not in seen_uids:\n                merged.append(feature)\n                seen_uids.add(feature.uid)\n        return merged\n\n    @staticmethod\n    def _raise_not_validated_values(\n        not_validated_values: dict[str, tuple[str, list[str]]],\n    ) -> None:\n        if not not_validated_values:\n            return None\n        hint = \"\"\n        for key, (field, values_list) in not_validated_values.items():\n            key_str = \"ln.Record\" if key == \"Record\" else key\n            create_true = \", create=True\" if \"bionty.\" not in key else \"\"\n            hint += f\"  records = {key_str}.from_values({values_list}, field='{field}'{create_true}).save()\\n\"\n        msg = (\n            f\"These values could not be validated: {dict(not_validated_values)}\\n\"\n            f\"Here is how to create records for them:\\n\\n{hint}\"\n        )\n    
    raise ValidationError(msg)\n\n    def _collect_record_feature_writes(\n        self,\n        *,\n        record,\n        feature_objects: list[Feature],\n        dictionary: dict[str, Any],\n        values_by_feature_uid: dict[str, Any] | None,\n        feature_json_values: list,\n        links_by_model: dict,\n        not_validated_values: dict[str, tuple[str, list[str]]],\n        resolved_records_by_feature_id: dict[int, dict[Any, list[SQLRecord]]]\n        | None = None,\n    ) -> None:\n        from ..base.dtypes import is_iterable_of_sqlrecord\n        from .can_curate import CanCurate\n        from .record import RecordJson\n\n        for feature in feature_objects:\n            if (\n                values_by_feature_uid is not None\n                and feature.uid in values_by_feature_uid\n            ):\n                value = values_by_feature_uid[feature.uid]\n            else:\n                value = dictionary[feature.name]\n            if value is None:\n                continue\n            if not (\n                feature.dtype_as_str.startswith(\"cat\")\n                or feature.dtype_as_str.startswith(\"list[cat\")\n            ):\n                _, converted_value, _ = infer_convert_dtype_key_value(\n                    key=feature.name, value=value, dtype_str=feature.dtype_as_str\n                )\n                feature_json_values.append(\n                    RecordJson(record=record, feature=feature, value=converted_value)\n                )\n                continue\n\n            if isinstance(value, SQLRecord) or is_iterable_of_sqlrecord(value):\n                if isinstance(value, SQLRecord):\n                    label_records = [value]\n                else:\n                    label_records = value  # type: ignore\n            else:\n                if isinstance(value, str):\n                    values = [value]  # type: ignore\n                else:\n                    values = value  # type: ignore\n                
if feature._dtype_str == \"cat\":\n                    feature._dtype_str = \"cat[ULabel]\"\n                    feature.save()\n                    result = {\n                        \"registry_str\": \"ULabel\",\n                        \"registry\": ULabel,\n                        \"field\": ULabel.name,\n                    }\n                else:\n                    result = parse_dtype(feature._dtype_str)[0]\n                # Fast path for dataframe-originated record batches:\n                # `bulk_set_features_in_records()` now runs a single `DataFrameCurator`\n                # pass and pre-resolves categorical values to label records.\n                #\n                # The cache key is feature.id and the nested key is the normalized\n                # raw value found in the dataframe. Using this cache here avoids\n                # running per-row `validate()` + `from_values()` calls, which used\n                # to duplicate work already done by the curator.\n                cached_records = None\n                if (\n                    resolved_records_by_feature_id is not None\n                    and feature.id in resolved_records_by_feature_id\n                ):\n                    cached_records = resolved_records_by_feature_id[feature.id]\n                if cached_records is not None:\n                    if isinstance(value, str):\n                        values_for_lookup = [value]\n                    else:\n                        values_for_lookup = value  # type: ignore\n                    if isinstance(values_for_lookup, (list, tuple, np.ndarray, set)):\n                        values_for_lookup = list(values_for_lookup)\n                    else:\n                        values_for_lookup = [values_for_lookup]\n                    label_records = []\n                    not_validated_for_feature = []\n                    for lookup_value in values_for_lookup:\n                        normalized_lookup = (\n                   
         lookup_value.item()\n                            if isinstance(lookup_value, np.generic)\n                            else lookup_value\n                        )\n                        mapped_records = cached_records.get(normalized_lookup)\n                        if mapped_records is None:\n                            # Keep the same error aggregation behavior as before:\n                            # unresolved categorical values are collected and raised\n                            # in one ValidationError after all records are processed.\n                            not_validated_for_feature.append(normalized_lookup)\n                        else:\n                            label_records.extend(mapped_records)\n                    if not_validated_for_feature:\n                        not_validated_values[result[\"registry_str\"]] = (  # type: ignore\n                            result[\"field_str\"],\n                            not_validated_for_feature,\n                        )\n                elif issubclass(result[\"registry\"], CanCurate):  # type: ignore\n                    # Fallback path for non-batch callers (e.g. 
direct\n                    # `record.features.add_values()` on an individual record).\n                    #\n                    # Those flows do not build dataframe-level caches, so we keep\n                    # the original registry-backed validation and resolution logic.\n                    # This branch should not be hot for the dataframe batch import\n                    # path because that path provides `resolved_records_by_feature_id`.\n                    validated = result[\"registry\"].validate(  # type: ignore\n                        values, field=result[\"field\"], mute=True\n                    )\n                    values_array = np.array(values)\n                    validated_values = values_array[validated]\n                    if validated.sum() != len(values):\n                        not_validated_values[result[\"registry_str\"]] = (  # type: ignore\n                            result[\"field_str\"],\n                            values_array[~validated].tolist(),\n                        )\n                    label_records = result[\"registry\"].from_values(  # type: ignore\n                        validated_values, field=result[\"field\"], mute=True\n                    )\n                else:\n                    label_records = result[\"registry\"].filter(  # type: ignore\n                        **{f\"{result['field_str']}__in\": values}\n                    )\n                    if len(label_records) != len(values):\n                        raise ValidationError(\n                            f\"Some of these values for {result['registry_str']} do not exist: {values}\"\n                        )\n            for label_record in label_records:\n                if label_record._state.adding:\n                    raise ValidationError(\n                        f\"Please save {label_record} before annotation.\"\n                    )\n                link_model, value_field_name, _ = get_categorical_link_info(\n                    
record.__class__,\n                    label_record.__class__,\n                    instance=getattr(record._state, \"db\", None),\n                )\n                links_by_model[link_model].append(\n                    link_model(\n                        record_id=record.id,\n                        feature_id=feature.id,\n                        **{f\"{value_field_name}_id\": label_record.id},\n                    )\n                )\n        return None\n\n    def add_values(\n        self,\n        values: dict[str | Feature, Any],\n        feature_field: FieldAttr = Feature.name,\n        schema: Schema = None,\n    ) -> None:\n        \"\"\"Add values for features.\n\n        Like `set_values()`, but slightly more performant because it does not remove previously-existing feature annotations at the danger\n        of violating multiplicity of categorical dtypes (see warning below).\n\n        Args:\n            values: A dictionary of keys (features) & values (labels, strings, numbers, booleans, datetimes, etc.).\n                Keys can be feature names (`str`) or `Feature` objects.\n                If a value is `None`, it will be skipped.\n            feature_field: The field of a registry to map the keys of the `values` dictionary in case strings are passed.\n            schema: Schema to validate against.\n\n        .. warning::\n\n            If you run::\n\n                obj.features.add_values({\"my_categorical\": \"my_category1\"})\n                obj.features.add_values({\"my_categorical\": \"my_category2\"})\n\n            you will annotate the object with two different values for the same feature even if its dtype is not a `list`.\n            That is, `add_values()` does **not** validate the `dtype` of a categorical feature across multiple calls.\n\n            To avoid this, please use `set_values()`.\n\n        .. 
dropdown:: Why is multiplicity of categorical dtypes not validated?\n\n            For simple data types like `int`, `date`, `dict`, etc., `add_values()` ensures that there is only\n            one value for a given `Record` and feature.\n\n            But for categorical/relational features or for simple dtypes in the context of annotating an `Artifact`, the underlying link table allows linking multiple\n            values to the same object and feature, so that both `list` dtypes and `set`-like aggregations on an object\n            can be represented with relational integrity.\n\n            Examples::\n\n                # the following needs to be allowed even if `cell_type` has dtype `CellType`, and not `list[CellType]`\n                # this is because the artifact might be a `DataFrame` with a column `cell_type` that has dtype `CellType`\n                # and the annotations on the artifact-level represent the aggregation of all values in that column\n                artifact.features.add_values({\"cell_type\": \"B cell\"})\n                artifact.features.add_values({\"cell_type\": \"T cell\"})\n                artifact.features.add_values({\"cell_type\": \"NK cell\"})\n\n                # now an example for Record\n                # while a record will never represent an aggregation, we still want to express\n                # lists of values with relational integrity, for instance, this\n                record.features.add_values({\"cell_types\": [\"B cell\", \"T cell\", \"NK cell\"]})\n\n        \"\"\"\n        from lamindb.curators.core import ExperimentalDictCurator\n\n        host_is_record = self._host.__class__.__name__ == \"Record\"\n        host_is_artifact = self._host.__class__.__name__ == \"Artifact\"\n        # rename to distinguish from the values inside the dict\n        (\n            dictionary,\n            string_key_values,\n            explicit_features,\n            values_by_feature_uid,\n        ) = 
self._resolve_feature_value_dictionary(values)\n        keys = dictionary.keys()\n        if isinstance(keys, DICT_KEYS_TYPE):\n            keys = list(keys)  # type: ignore\n        if (\n            host_is_record\n            and self._host.type is not None\n            and self._host.type.schema is not None  # type: ignore\n        ):\n            assert schema is None, \"Cannot pass schema if record.type has schema.\"\n            schema = self._host.type.schema  # type: ignore\n        if host_is_artifact:\n            if self._get_external_schema():\n                raise ValueError(\"Cannot add values if artifact has external schema.\")\n        if schema is not None:\n            member_ids = set(schema.members.values_list(\"id\", flat=True))\n            features_not_in_schema = [\n                feature.name\n                for feature in explicit_features\n                if feature.id not in member_ids\n            ]\n            if features_not_in_schema:\n                raise ValidationError(\n                    \"These feature keys are not in the provided schema: \"\n                    f\"{features_not_in_schema}\"\n                )\n            looked_up_features = schema.members.filter(name__in=keys)\n            feature_objects = self._merge_feature_objects(\n                explicit_features, looked_up_features\n            )\n        else:\n            if string_key_values:\n                looked_up_features = self._get_feature_objects(\n                    string_key_values, feature_field\n                )\n            else:\n                looked_up_features = Feature.objects.none()\n            feature_objects = self._merge_feature_objects(\n                explicit_features, looked_up_features\n            )\n            schema = Schema(feature_objects)\n        ExperimentalDictCurator(\n            dictionary, schema, require_saved_schema=False\n        ).validate()\n        return self._add_values(\n            feature_objects,\n 
           dictionary,\n            values_by_feature_uid=values_by_feature_uid,\n        )\n\n    def _add_values(\n        self,\n        feature_objects,\n        dictionary,\n        *,\n        values_by_feature_uid: dict[str, Any] | None = None,\n    ):\n        from ..base.dtypes import is_iterable_of_sqlrecord\n        from .can_curate import CanCurate\n\n        host_is_record = self._host.__class__.__name__ == \"Record\"\n        if host_is_record:\n            feature_json_values: list[SQLRecord] = []\n            links_by_model: dict[type[SQLRecord], list[SQLRecord]] = defaultdict(list)\n            record_not_validated_values: dict[str, tuple[str, list[str]]] = {}\n            self._collect_record_feature_writes(\n                record=self._host,\n                feature_objects=feature_objects,\n                dictionary=dictionary,\n                values_by_feature_uid=values_by_feature_uid,\n                feature_json_values=feature_json_values,\n                links_by_model=links_by_model,\n                not_validated_values=record_not_validated_values,\n            )\n            self._raise_not_validated_values(record_not_validated_values)\n            if feature_json_values:\n                save(feature_json_values)\n            for links in links_by_model.values():\n                try:\n                    save(links, ignore_conflicts=False)\n                except Exception:\n                    save(links, ignore_conflicts=True)\n            return None\n\n        features_labels = defaultdict(list)\n        feature_json_values = []\n        not_validated_values: dict[str, tuple[str, list[str]]] = {}\n        for feature in feature_objects:\n            if (\n                values_by_feature_uid is not None\n                and feature.uid in values_by_feature_uid\n            ):\n                value = values_by_feature_uid[feature.uid]\n            else:\n                value = dictionary[feature.name]\n            if value 
is None:\n                continue\n            if not (\n                feature.dtype_as_str.startswith(\"cat\")\n                or feature.dtype_as_str.startswith(\"list[cat\")\n            ):\n                _, converted_value, _ = infer_convert_dtype_key_value(\n                    key=feature.name, value=value, dtype_str=feature.dtype_as_str\n                )\n                filter_kwargs = {\"feature\": feature, \"value\": converted_value}\n                feature_value, _ = JsonValue.get_or_create(**filter_kwargs)\n                feature_json_values.append(feature_value)\n            else:\n                if isinstance(value, SQLRecord) or is_iterable_of_sqlrecord(value):\n                    if isinstance(value, SQLRecord):\n                        label_records = [value]\n                    else:\n                        label_records = value  # type: ignore\n                    for record in label_records:\n                        if record._state.adding:\n                            raise ValidationError(\n                                f\"Please save {record} before annotation.\"\n                            )\n                        features_labels[\n                            record.__class__.__get_name_with_module__()\n                        ].append((feature, record))\n                else:\n                    if isinstance(value, str):\n                        values = [value]  # type: ignore\n                    else:\n                        values = value  # type: ignore\n                    if feature._dtype_str == \"cat\":\n                        new_dtype_str = feature._dtype_str + \"[ULabel]\"\n                        feature._dtype_str = new_dtype_str\n                        feature.save()\n                        result = {\n                            \"registry_str\": \"ULabel\",\n                            \"registry\": ULabel,\n                            \"field\": ULabel.name,\n                        }\n              
      else:\n                        result = parse_dtype(feature._dtype_str)[0]\n                    if issubclass(result[\"registry\"], CanCurate):  # type: ignore\n                        validated = result[\"registry\"].validate(  # type: ignore\n                            values, field=result[\"field\"], mute=True\n                        )\n                        values_array = np.array(values)\n                        validated_values = values_array[validated]\n                        if validated.sum() != len(values):\n                            not_validated_values[result[\"registry_str\"]] = (  # type: ignore\n                                result[\"field_str\"],\n                                values_array[~validated].tolist(),\n                            )\n                        label_records = result[\"registry\"].from_values(  # type: ignore\n                            validated_values, field=result[\"field\"], mute=True\n                        )\n                    else:\n                        label_records = result[\"registry\"].filter(  # type: ignore\n                            **{f\"{result['field_str']}__in\": values}\n                        )\n                        if len(label_records) != len(values):\n                            raise ValidationError(\n                                f\"Some of these values for {result['registry_str']} do not exist: {values}\"\n                            )\n                    features_labels[result[\"registry_str\"]] += [  # type: ignore\n                        (feature, label_record) for label_record in label_records\n                    ]\n        # TODO: given we had already validated prior to calling _add_values, this block below should never be reached\n        # refactor this out if possible\n        self._raise_not_validated_values(not_validated_values)\n        if features_labels:\n            self._add_label_feature_links(features_labels)\n        if feature_json_values:\n         
   to_insertjson_values = [\n                record for record in feature_json_values if record._state.adding\n            ]\n            if to_insertjson_values:\n                save(to_insertjson_values)\n            links = [\n                self._host.json_values.through(\n                    **{\n                        f\"{self._host.__class__.__name__.lower()}_id\": self._host.id,\n                        \"jsonvalue_id\": json_value.id,\n                    }\n                )\n                for json_value in feature_json_values\n            ]\n            # a link might already exist, hence ignore_conflicts is needed\n            save(links, ignore_conflicts=True)\n\n    def set_values(\n        self,\n        values: dict[str | Feature, Any],\n        feature_field: FieldAttr = Feature.name,\n        schema: Schema = None,\n    ) -> None:\n        \"\"\"Set values for features.\n\n        Note that, in the context of annotating an `Artifact`, this does **not** affect the annotations derived from the artifact's dataset features. 
It only sets\n        the artifact's external feature annotations.\n\n        Args:\n            values: A dictionary of keys (features) & values (labels, strings, numbers, booleans, datetimes, etc.).\n                Keys can be feature names (`str`) or `Feature` objects.\n                If a value is `None`, it will be skipped.\n            feature_field: The field of a registry to map the keys of the `values` dictionary in case strings are passed.\n            schema: Schema to validate against.\n\n        Examples:\n\n            Here is how to annotate an artifact ad hoc::\n\n                artifact.features.set_values({\n                    \"species\": \"human\",\n                    \"scientist\": ['Barbara McClintock', 'Edgar Anderson'],\n                    \"temperature\": 27.6,\n                    \"experiment\": \"Experiment 1\"\n                })\n\n            Query artifacts by features::\n\n                ln.Artifact.filter(scientist=\"Barbara McClintock\")\n\n            If your feature names are ambiguous, you can use a `Feature` object to disambiguate::\n\n                temperature = ln.Feature.get(name=\"temperature\", type__name=\"my_feature_type\")\n\n                # to set feature values\n                artifact.features.set_values({temperature: 0.5})  # temperature is the feature object\n\n                # to query by feature values\n                ln.Artifact.filter(temperature == 0.5)  # instead of temperature=0.5\n\n            You can pass a schema to validate the dictionary::\n\n                schema = ln.Schema([ln.Feature(name=\"species\", dtype=str).save()]).save()\n                artifact.features.set_values({\"species\": \"bird\"}, schema=schema)\n\n            Also see :class:`lamindb.Artifact.features`, :class:`lamindb.Record.features`, and :class:`lamindb.Run.features`.\n        \"\"\"\n        from lamindb.curators.core import ExperimentalDictCurator\n\n        host_is_record = self._host.__class__.__name__ == 
\"Record\"\n        host_is_artifact = self._host.__class__.__name__ == \"Artifact\"\n        # rename to distinguish from the values inside the dict\n        (\n            dictionary,\n            string_key_values,\n            explicit_features,\n            values_by_feature_uid,\n        ) = self._resolve_feature_value_dictionary(values)\n        keys = dictionary.keys()\n        if isinstance(keys, DICT_KEYS_TYPE):\n            keys = list(keys)  # type: ignore\n        if (\n            host_is_record\n            and self._host.type is not None\n            and self._host.type.schema is not None  # type: ignore\n        ):\n            assert schema is None, \"Cannot pass schema if record.type has schema.\"\n            schema = self._host.type.schema  # type: ignore\n        if host_is_artifact:\n            schema = self._get_external_schema()\n        if schema is not None:\n            ExperimentalDictCurator(dictionary, schema).validate()\n            member_ids = set(schema.members.values_list(\"id\", flat=True))\n            features_not_in_schema = [\n                feature.name\n                for feature in explicit_features\n                if feature.id not in member_ids\n            ]\n            if features_not_in_schema:\n                raise ValidationError(\n                    \"These feature keys are not in the provided schema: \"\n                    f\"{features_not_in_schema}\"\n                )\n            looked_up_features = schema.members.filter(name__in=keys)\n            feature_objects = self._merge_feature_objects(\n                explicit_features, looked_up_features\n            )\n        else:\n            if string_key_values:\n                looked_up_features = self._get_feature_objects(\n                    string_key_values, feature_field\n                )\n            else:\n                looked_up_features = Feature.objects.none()\n            feature_objects = self._merge_feature_objects(\n               
 explicit_features, looked_up_features\n            )\n        self._remove_values()\n        self._add_values(\n            feature_objects,\n            dictionary=dictionary,\n            values_by_feature_uid=values_by_feature_uid,\n        )\n\n    def _get_external_schema(self) -> Schema | None:\n        external_schema = None\n        if self._host.otype is None:\n            external_schema = self._host.schema\n        elif self._host.schema is not None:\n            external_schema = self._host.schema.slots.get(\"__external__\", None)\n        return external_schema\n\n    def remove_values(\n        self,\n        feature: (\n            str | Feature | list[str | Feature] | dict[str | Feature, Any | None] | None\n        ) = None,\n        *,\n        value: Any | None = None,\n    ) -> None:\n        \"\"\"Remove values for features.\n\n        Args:\n            feature: Indicate one or several features for which to remove values.\n                If `None`, values for all external features will be removed.\n                Also supports a dictionary mapping feature keys to values to remove,\n                e.g. 
`{feature: value}`.\n            value: An optional value to restrict removal to a single value.\n        \"\"\"\n        host_name = self._host.__class__.__name__.lower()\n        host_is_artifact = host_name == \"artifact\"\n\n        if host_is_artifact:\n            external_schema = self._get_external_schema()\n            if external_schema is not None:\n                raise ValueError(\n                    \"Cannot remove values if artifact has external schema.\"\n                )\n        return self._remove_values(\n            feature,\n            value=value,\n        )\n\n    def _remove_values(\n        self,\n        feature: (\n            str | Feature | list[str | Feature] | dict[str | Feature, Any | None] | None\n        ) = None,\n        *,\n        value: Any | None = None,\n    ) -> None:\n        from django.apps import apps\n\n        host_name = self._host.__class__.__name__.lower()\n        host_is_record = host_name == \"record\"\n        host_is_artifact = host_name == \"artifact\"\n\n        if isinstance(feature, dict):\n            if value is not None:\n                raise ValueError(\n                    \"Pass either `value=` or per-feature values via a dictionary, not both.\"\n                )\n            for one_feature, one_value in feature.items():\n                self._remove_values(one_feature, value=one_value)\n            return\n        if feature is None:\n            features = get_features_data(\n                self._host, to_dict=True, external_only=True\n            ).keys()\n        elif not isinstance(feature, list):\n            features = [feature]\n        else:\n            features = feature\n        for feature in features:\n            if isinstance(feature, str):\n                feature_record = Feature.get(name=feature)\n            else:\n                feature_record = feature\n                if feature_record._state.adding:\n                    raise ValidationError(\n                        
f\"Please save feature '{feature_record.name}' before annotation.\"\n                    )\n                if (\n                    self._host._state.db is not None\n                    and feature_record._state.db != self._host._state.db\n                ):\n                    feature_record = Feature.connect(self._host._state.db).get(\n                        uid=feature_record.uid\n                    )\n            if host_is_artifact:\n                for schema in self.slots.values():\n                    if feature_record in schema.members:\n                        raise ValueError(\"Cannot remove values for dataset features.\")\n            filter_kwargs = {\"feature\": feature_record}\n            none_message = f\"with value {value!r} \" if value is not None else \"\"\n            if feature_record._dtype_str.startswith((\"cat[\", \"list[cat\")):  # type: ignore\n                feature_registry = parse_dtype(feature_record._dtype_str)[0][\n                    \"registry_str\"\n                ]\n                if \".\" in feature_registry:\n                    parts = feature_registry.split(\".\")\n                    app_label = parts[0]\n                    entity_name = parts[-1]\n                else:\n                    app_label = \"lamindb\"\n                    entity_name = feature_registry\n                host_name = self._host.__class__.__name__\n                link_model_name = f\"{host_name}{entity_name}\"\n                link_model = apps.get_model(app_label, link_model_name)\n                filter_kwargs[host_name.lower()] = self._host\n                if value is not None:\n                    if not isinstance(value, SQLRecord):\n                        raise TypeError(\n                            f\"Expected a record for removing categorical feature value, \"\n                            f\"got {value} of type {type(value)}\"\n                        )\n                    assert not host_is_record, \"Only artifacts support 
passing a value.\"\n                    filter_kwargs[entity_name.lower()] = value\n                link_records = link_model.objects.filter(**filter_kwargs)\n                if not link_records.exists():\n                    value_msg = f\"with value {value!r} \" if value is not None else \"\"\n                    logger.warning(\n                        f\"no feature '{feature_record.name}' {value_msg}found on \"\n                        f\"{host_name.lower()} '{self._host.uid}'!\"\n                    )\n                    return\n                link_records.delete()\n            else:\n                if value is not None:\n                    filter_kwargs[\"value\"] = value\n                if host_is_record:\n                    feature_values = self._host.values_json.filter(**filter_kwargs)\n                else:\n                    feature_values = self._host.json_values.filter(**filter_kwargs)\n                if not feature_values.exists():\n                    logger.warning(\n                        f\"no feature '{feature_record.name}' {none_message}found on {self._host.__class__.__name__.lower()} '{self._host.uid}'!\"\n                    )\n                    return\n                if host_is_record:\n                    feature_values.delete(permanent=True)\n                else:\n                    # the below might leave a dangling feature_value record\n                    # but we don't want to pay the price of making another query just to remove this annotation\n                    # we can clean the JsonValue registry periodically if we want to\n                    self._host.json_values.remove(*feature_values)\n\n    def _add_schema(self, schema: Schema, slot: str) -> None:\n        \"\"\"Annotate artifact with a schema.\n\n        Args:\n            schema: `Schema` A schema record.\n            slot: `str` The slot that marks where the schema is stored in\n                the artifact.\n        \"\"\"\n        # TODO: deprecate as 
soon as we have the Schema-based curators\n        if self._host._state.adding:\n            raise ValueError(\n                \"Please save the artifact or collection before adding a feature set!\"\n            )\n        host_db = self._host._state.db\n        schema.save(using=host_db)\n        kwargs = {\n            \"artifact_id\": self._host.id,\n            \"schema\": schema,\n            \"slot\": slot,\n        }\n        link_record = (\n            self._host.schemas.through.objects.using(host_db)\n            .filter(**kwargs)\n            .one_or_none()\n        )\n        if link_record is None:\n            self._host.schemas.through(**kwargs).save(using=host_db)\n            if slot in self.slots:\n                logger.debug(f\"replaced existing {slot} feature set\")\n            self._slots[slot] = schema  # type: ignore\n\n    def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):\n        \"\"\"Transfer features from a artifact or collection.\"\"\"\n        # This only covers feature sets\n        if transfer_logs is None:\n            transfer_logs = {\"mapped\": [], \"transferred\": [], \"run\": None}\n        from lamindb import settings\n\n        using_key = settings._using_key\n        for slot, schema in data.features.slots.items():  # type: ignore\n            try:\n                members = schema.members\n            except ModuleWasntConfigured as err:\n                logger.warning(f\"skipping transfer of {slot} schema because {err}\")\n                continue\n            if len(members) == 0:\n                continue\n            if len(members) > settings.annotation.n_max_records:\n                logger.warning(\n                    f\"skipping creating {len(members)} > {settings.annotation.n_max_records} new {members[0].__class__.__name__} records\"\n                )\n                schema_self = schema\n                schema_exists = Schema.filter(hash=schema_self.hash).one_or_none()\n           
     if schema_exists is not None:\n                    schema_self = schema_exists\n                else:\n                    schema_self.save()\n            else:\n                registry = members[0].__class__\n                # note here the features are transferred based on an unique field\n                field = REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), \"uid\")\n                # this will be e.g. be a list of ontology_ids or uids\n                member_uids = list(members.values_list(field, flat=True))\n                validated = registry.validate(member_uids, field=field, mute=True)\n                new_members_uids = list(compress(member_uids, ~validated))\n                new_members = members.filter(**{f\"{field}__in\": new_members_uids})\n                n_new_members = len(new_members)\n                if len(members) > settings.annotation.n_max_records:\n                    logger.warning(\n                        f\"skipping creating {n_new_members} > {settings.annotation.n_max_records} new {registry.__name__} records\"\n                    )\n                if n_new_members > 0:\n                    # transfer foreign keys needs to be run before transfer to default db\n                    transfer_fk_to_default_db_bulk(\n                        new_members, using_key, transfer_logs=transfer_logs\n                    )\n                    for feature in new_members:\n                        # not calling save=True here as in labels, because want to\n                        # bulk save below\n                        # transfer_fk is set to False because they are already transferred\n                        # in the previous step transfer_fk_to_default_db_bulk\n                        transfer_to_default_db(\n                            feature,\n                            using_key,\n                            transfer_fk=False,\n                            transfer_logs=transfer_logs,\n                        )\n                  
  save(\n                        new_members, ignore_conflicts=True\n                    )  # conflicts arising from existing records are ignored\n\n                # create a new feature set from feature values using the same uid\n                schema_self = Schema.from_values(\n                    member_uids, field=getattr(registry, field)\n                )\n                if schema_self is None:\n                    if hasattr(registry, \"organism_id\"):\n                        logger.warning(\n                            f\"Schema is not transferred, check if organism is set correctly: {schema}\"\n                        )\n                    continue\n                # make sure the uid matches if schema is composed of same features\n                if schema_self.hash == schema.hash:\n                    schema_self.uid = schema.uid\n                logger.info(f\"saving {slot} schema: {schema_self}\")\n            try:\n                self._host.features._add_schema(schema_self, slot)\n            except IntegrityError:\n                logger.warning(\n                    f\"updating annotation of artifact {self._host.uid} with feature set for slot: {slot}\"\n                )\n                self._host.schemas.through.objects.get(\n                    artifact_id=self._host.id, slot=slot\n                ).delete()\n                self._host.features._add_schema(schema_self, slot)\n\n\ndef bulk_set_features_in_records(records: Iterable[Record]) -> None:\n    \"\"\"Bulk-set lazy feature dictionaries for records.\n\n    Intended for records created via `Record(features=...)` and persisted with\n    `ln.save([...])`.\n    \"\"\"\n    import pandas as pd\n\n    from lamindb.curators.core import DataFrameCurator\n\n    records_with_features = [\n        record\n        for record in records\n        if hasattr(record, \"_features\") and record._features is not None\n    ]\n    if len(records_with_features) == 0:\n        return None\n\n    
batch_schema: Schema | None = None\n    prepared_records: list[\n        tuple[Record, FeatureManager, dict[str, Any], list[Feature], dict[str, Any]]\n    ] = []\n    prepared_rows: list[dict[str, Any]] = []\n    for record in records_with_features:\n        schema = None\n        if record.type is not None and record.type.schema is not None:\n            schema = record.type.schema\n        if schema is None:\n            raise ValidationError(\n                \"Bulk setting features in records requires all records to have the same non-null type schema.\"\n            )\n        if batch_schema is None:\n            batch_schema = schema\n        elif schema.id != batch_schema.id:\n            raise ValidationError(\n                \"Bulk setting features in records requires all records to have the same type schema.\"\n            )\n        manager = record.features\n        (\n            dictionary,\n            _,\n            explicit_features,\n            values_by_feature_uid,\n        ) = manager._resolve_feature_value_dictionary(record._features)\n        prepared_rows.append(dictionary)\n        prepared_records.append(\n            (record, manager, dictionary, explicit_features, values_by_feature_uid)\n        )\n\n    assert batch_schema is not None  # noqa: S101\n    schema_features = list(batch_schema.members.all())\n    dataframe = pd.DataFrame(prepared_rows)\n    for feature in schema_features:\n        if (\n            feature.name in dataframe\n            and feature.dtype_as_str.startswith(\"cat\")\n            and not feature.dtype_as_str.startswith(\"list[cat\")\n        ):\n            dataframe[feature.name] = dataframe[feature.name].astype(\"category\")\n    # Single-pass dataframe curation:\n    # validate schema and resolve categoricals once for the entire batch.\n    #\n    # The resolved label records are then reused below when creating per-record\n    # link rows, avoiding repeated registry calls for each row.\n    curator = 
DataFrameCurator(dataframe, batch_schema)\n    curator.validate()\n\n    members_by_name: dict[str, list[Feature]] = defaultdict(list)\n    schema_member_ids: set[int] = set()\n    resolved_records_by_feature_id: dict[int, dict[Any, list[SQLRecord]]] = {}\n    for feature in schema_features:\n        members_by_name[feature.name].append(feature)\n        schema_member_ids.add(feature.id)\n        if not (\n            feature.dtype_as_str.startswith(\"cat\")\n            or feature.dtype_as_str.startswith(\"list[cat\")\n        ):\n            continue\n        cat_vector = curator.cat._cat_vectors.get(feature.name)\n        if cat_vector is None or cat_vector.records is None:\n            continue\n        # Build lookup cache:\n        #   feature.id -> raw value -> [resolved label records]\n        #\n        # We intentionally keep a list of records per value to support\n        # list-categorical and potential multi-match cases consistently with\n        # existing link creation semantics.\n        cache_for_feature: dict[Any, list[SQLRecord]] = defaultdict(list)\n        for label_record in cat_vector.records:\n            key = getattr(label_record, cat_vector._field_name)\n            normalized_key = key.item() if isinstance(key, np.generic) else key\n            cache_for_feature[normalized_key].append(label_record)\n        resolved_records_by_feature_id[feature.id] = dict(cache_for_feature)\n\n    feature_json_values: list[SQLRecord] = []\n    links_by_model: dict[type[SQLRecord], list[SQLRecord]] = defaultdict(list)\n    not_validated_values: dict[str, tuple[str, list[str]]] = {}\n    for (\n        record,\n        manager,\n        dictionary,\n        explicit_features,\n        values_by_feature_uid,\n    ) in prepared_records:\n        keys = list(dictionary.keys())\n        features_not_in_schema = [\n            feature.name\n            for feature in explicit_features\n            if feature.id not in schema_member_ids\n        ]\n        if 
features_not_in_schema:\n            raise ValidationError(\n                \"These feature keys are not in the provided schema: \"\n                f\"{features_not_in_schema}\"\n            )\n        looked_up_features = [\n            feature for key in keys for feature in members_by_name.get(key, [])\n        ]\n        feature_objects = manager._merge_feature_objects(\n            explicit_features, looked_up_features\n        )\n        manager._collect_record_feature_writes(\n            record=record,\n            feature_objects=feature_objects,\n            dictionary=dictionary,\n            values_by_feature_uid=values_by_feature_uid,\n            feature_json_values=feature_json_values,\n            links_by_model=links_by_model,\n            not_validated_values=not_validated_values,\n            resolved_records_by_feature_id=resolved_records_by_feature_id,\n        )\n    FeatureManager._raise_not_validated_values(not_validated_values)\n    if feature_json_values:\n        save(feature_json_values)\n    for links in links_by_model.values():\n        try:\n            save(links, ignore_conflicts=False)\n        except Exception:\n            save(links, ignore_conflicts=True)\n    for record in records_with_features:\n        del record._features\n    return None\n"
  },
  {
    "path": "lamindb/models/_from_values.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom lamin_utils import colors, logger\n\nif TYPE_CHECKING:\n    from pandas import DataFrame, Index\n\n    from lamindb.base.types import FieldAttr, ListLike\n\n    from .query_set import SQLRecordList\n    from .sqlrecord import SQLRecord\n\n\n# The base function for `from_values`\ndef _from_values(\n    iterable: ListLike,\n    field: FieldAttr,\n    *,\n    create: bool = False,\n    organism: SQLRecord | str | None = None,\n    source: SQLRecord | None = None,\n    standardize: bool = True,\n    from_source: bool = True,\n    mute: bool = False,\n    **filter_kwargs,\n) -> SQLRecordList:\n    \"\"\"Get or create records from iterables.\"\"\"\n    from .query_set import SQLRecordList\n\n    registry = field.field.model  # type: ignore\n    organism_record = get_organism_record_from_field(field, organism, values=iterable)\n    # TODO: the create is problematic if field is not a name field\n    if create:\n        create_kwargs = {}\n        if organism_record:\n            create_kwargs[\"organism\"] = organism_record\n        return SQLRecordList(\n            [\n                registry(**{field.field.name: value}, **create_kwargs)\n                for value in iterable\n            ]\n        )  # type: ignore\n\n    iterable_idx = index_iterable(iterable)\n\n    # returns existing records & non-existing values\n    records, nonexist_values, msg = get_existing_records(\n        iterable_idx=iterable_idx,\n        field=field,\n        organism=organism_record,\n        mute=mute,\n        **filter_kwargs,\n    )\n\n    # new records to be created based on new values\n    if len(nonexist_values) > 0:\n        if from_source and registry.__base__.__name__ == \"BioRecord\":\n            # if can and needed, get organism record from the existing records\n            if (\n                organism_record is None\n                and len(records) > 0\n                and 
registry.require_organism()\n            ):\n                organism_record = records[0].organism\n            records_public, unmapped_values = create_records_from_source(\n                iterable_idx=nonexist_values,\n                field=field,\n                organism=organism_record,\n                source=source,\n                standardize=standardize,\n                msg=msg,\n                mute=mute,\n            )\n            if len(records_public) > 0:\n                msg = \"\"\n            for record in records_public:\n                record._from_source = True\n            records += records_public\n        else:\n            unmapped_values = nonexist_values\n        # unmapped new_ids will NOT create records\n        if len(unmapped_values) > 0:\n            # first log the success message\n            if len(msg) > 0 and not mute:\n                logger.success(msg)\n            s = \"\" if len(unmapped_values) == 1 else \"s\"\n            print_values = colors.yellow(_format_values(unmapped_values))\n            n_nonval = colors.yellow(f\"{len(unmapped_values)} non-validated\")\n            if not mute:\n                logger.info(\n                    f\"{colors.red('did not create')} {registry.__name__} record{s} for \"\n                    f\"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}\"  # type: ignore\n                )\n    return SQLRecordList(records)\n\n\ndef get_existing_records(\n    iterable_idx: Index,\n    field: FieldAttr,\n    organism: SQLRecord | None = None,\n    standardize: bool = True,\n    mute: bool = False,\n    **filter_kwargs,\n) -> tuple[list, Index, str]:\n    \"\"\"Get existing records from the database.\"\"\"\n    import pandas as pd\n\n    from .can_curate import _validate\n\n    # NOTE: existing records matching is agnostic to the source\n    registry = field.field.model  # type: ignore\n    queryset = registry.filter(**filter_kwargs)\n\n    if standardize:\n        # log 
synonyms mapped terms\n        if hasattr(registry, \"standardize\"):\n            syn_mapper = queryset.standardize(\n                iterable_idx,\n                field=field,\n                organism=organism,\n                mute=True,\n                from_source=False,  # standardize only based on the DB reference\n                return_mapper=True,\n            )\n            iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index\n    else:\n        syn_mapper = {}\n\n    # now we have to sort the list of queried records\n    # preserved = Case(\n    #     *[\n    #         When(**{field.field.name: value}, then=pos)\n    #         for pos, value in enumerate(iterable_idx)\n    #     ]\n    # )\n    # order by causes a factor 10 in runtime\n    # records = query_set.order_by(preserved).to_list()\n\n    # log validated terms\n    is_validated = _validate(\n        cls=queryset, values=iterable_idx, field=field, organism=organism, mute=True\n    )\n    if len(is_validated) > 0:\n        validated = iterable_idx[is_validated]\n    else:\n        validated = []\n    msg = \"\"\n    syn_msg = \"\"\n    if not mute:\n        if len(validated) > 0:\n            s = \"\" if len(validated) == 1 else \"s\"\n            print_values = colors.green(_format_values(validated))\n            msg = (\n                \"loaded\"\n                f\" {colors.green(f'{len(validated)} {registry.__name__} record{s}')}\"\n                f\" matching {colors.italic(f'{field.field.name}')}: {print_values}\"\n            )\n        if len(syn_mapper) > 0:\n            s = \"\" if len(syn_mapper) == 1 else \"s\"\n            names = list(syn_mapper.keys())\n            print_values = colors.green(_format_values(names))\n            syn_msg = (\n                \"loaded\"\n                f\" {colors.green(f'{len(syn_mapper)} {registry.__name__} record{s}')}\"\n                f\" matching {colors.italic('synonyms')}: {print_values}\"\n            )\n\n    # no 
logging if all values are validated\n    # logs if there are synonyms\n    if len(syn_msg) > 0:\n        if len(msg) > 0 and not mute:\n            logger.success(msg)\n        if not mute:\n            logger.success(syn_msg)\n        msg = \"\"\n\n    # get all existing records in the db\n    query = {f\"{field.field.name}__in\": iterable_idx.values}  # type: ignore\n    if organism is not None:\n        query[\"organism\"] = organism\n    records = queryset.filter(**query).to_list()\n\n    if len(validated) == len(iterable_idx):\n        return records, pd.Index([]), msg\n    else:\n        nonval_values = iterable_idx.difference(validated)\n        return records, nonval_values, msg\n\n\ndef create_records_from_source(\n    iterable_idx: Index,\n    field: FieldAttr,\n    organism: SQLRecord | None = None,\n    source: SQLRecord | None = None,\n    standardize: bool = True,\n    msg: str = \"\",\n    mute: bool = False,\n) -> tuple[list, Index]:\n    \"\"\"Create records from source.\"\"\"\n    registry = field.field.model  # type: ignore\n    records: list = []\n    # populate additional fields from public_df\n    from bionty._organism import OrganismNotSet\n    from bionty._source import filter_public_df_columns, get_source_record\n\n    # get the default source\n    if organism is None and registry.require_organism(field=field):\n        raise OrganismNotSet(\n            f\"`organism` is required to create new {registry.__name__} records from source!\"\n        )\n    try:\n        source_record = get_source_record(registry, organism, source)\n    except ValueError:\n        # no source found\n        return records, iterable_idx\n\n    # create the corresponding PublicOntology object from registry\n    try:\n        public_ontology = registry.public(source=source_record)\n    except Exception:\n        # no public source\n        return records, iterable_idx\n\n    # filter the columns in public df based on fields\n    public_df = 
filter_public_df_columns(\n        registry=registry, public_ontology=public_ontology\n    )\n\n    if public_df.empty:\n        return records, iterable_idx\n\n    # standardize in the public reference\n    # do not inspect synonyms if the field is not name field\n    result = public_ontology.inspect(\n        iterable_idx,\n        field=field.field.name,  # type: ignore\n        standardize=False\n        if hasattr(registry, \"_name_field\") and field.field.name != registry._name_field\n        else standardize,  # type: ignore\n        mute=True,\n    )\n    syn_mapper = result.synonyms_mapper\n\n    msg_syn: str = \"\"\n    if len(syn_mapper) > 0:\n        s = \"\" if len(syn_mapper) == 1 else \"s\"\n        names = list(syn_mapper.keys())\n        print_values = colors.purple(_format_values(names))\n        msg_syn = (\n            \"created\"\n            f\" {colors.purple(f'{len(syn_mapper)} {registry.__name__} record{s} from Bionty')}\"\n            f\" matching {colors.italic('synonyms')}: {print_values}\"\n        )\n\n        iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index\n\n    # create records for values that are found in the public reference\n    # matching either field or synonyms\n    mapped_values = iterable_idx.intersection(public_df[field.field.name])  # type: ignore\n\n    multi_msg = \"\"\n    if len(mapped_values) > 0:\n        public_kwargs, multi_msg = _bulk_create_dicts_from_df(\n            keys=mapped_values,\n            column_name=field.field.name,  # type: ignore\n            df=public_df,\n        )\n\n        create_kwargs = (\n            {\"organism\": organism, \"source\": source_record}\n            if organism is not None\n            else {\"source\": source_record}\n        )\n        for bk in public_kwargs:\n            # skip validation to speed up bulk creation since the values don't validate in the registry DB yet\n            records.append(registry(**bk, **create_kwargs, 
_skip_validation=True))\n\n        # number of records that matches field (not synonyms)\n        validated = result.validated\n        if len(validated) > 0:\n            s = \"\" if len(validated) == 1 else \"s\"\n            print_values = colors.purple(_format_values(validated))\n            # this is the success msg for existing records in the DB from get_existing_records\n            if len(msg) > 0 and not mute:\n                logger.success(msg)\n            if not mute:\n                logger.success(\n                    \"created\"\n                    f\" {colors.purple(f'{len(validated)} {registry.__name__} record{s} from Bionty')}\"\n                    f\" matching {colors.italic(f'{field.field.name}')}: {print_values}\"  # type: ignore\n                )\n\n    # make sure that synonyms logging appears after the field logging\n    if len(msg_syn) > 0 and not mute:\n        logger.success(msg_syn)\n    # warning about multi matches\n    if len(multi_msg) > 0 and not mute:\n        logger.warning(multi_msg)\n\n    # return the values that are not found in the public reference\n    unmapped_values = iterable_idx.difference(mapped_values)\n    return records, unmapped_values\n\n\ndef index_iterable(iterable: ListLike) -> Index:\n    \"\"\"Get unique values from an iterable.\"\"\"\n    import pandas as pd\n\n    idx = pd.Index(iterable).unique()\n    # No entries are made for NAs, '', None\n    # returns an ordered unique not null list\n    return idx[(idx != \"\") & (~idx.isnull())]\n\n\ndef _format_values(\n    names: ListLike, n: int = 20, quotes: bool = True, sep: str = \"'\"\n) -> str:\n    \"\"\"Format values for printing.\"\"\"\n    items = {str(name): None for name in names if name != \"None\"}\n\n    unique_items = list(items.keys())\n\n    if quotes:\n        unique_items = [f\"{sep}{item}{sep}\" for item in unique_items]\n\n    print_values = \", \".join(unique_items[:n])\n\n    if len(unique_items) > n:\n        print_values += \", 
...\"\n\n    return print_values\n\n\ndef _bulk_create_dicts_from_df(\n    keys: set | list, column_name: str, df: DataFrame\n) -> tuple[dict, str]:\n    \"\"\"Get fields from a DataFrame for many rows.\"\"\"\n    multi_msg = \"\"\n    if df.index.name != column_name:\n        df = df.set_index(column_name).loc[list(keys)]\n    if not df.index.is_unique:\n        # return all records for multi-matches with a warning\n        dup = df.index[df.index.duplicated()].unique().tolist()\n        if len(dup) > 0:\n            s = \"\" if len(dup) == 1 else \"s\"\n            print_values = _format_values(dup)\n            multi_msg = (\n                f\"ambiguous validation in Bionty for {len(dup)} record{s}:\"\n                f\" {print_values}\"\n            )\n\n    return df.reset_index().to_dict(orient=\"records\"), multi_msg\n\n\ndef get_organism_record_from_field(  # type: ignore\n    field: FieldAttr,\n    organism: str | SQLRecord | None = None,\n    values: ListLike = None,\n    using_key: str | None = None,\n) -> SQLRecord | None:\n    \"\"\"Get organism record based on which field is used in from_values.\n\n    Args:\n        field: the field of the registry for from_values\n        organism: the organism to get the organism record for\n        values: the values passed to from_values\n        using_key: the db to get the organism record from\n\n    Returns:\n        The organism record if both conditions are met:\n            The organism FK is required for the registry\n            The field is not unique (e.g. 
Gene.symbol) or the organism is not None\n    \"\"\"\n    registry = field.field.model\n    if registry.__base__.__name__ != \"BioRecord\":\n        return None\n\n    from bionty._organism import (\n        create_or_get_organism_record,\n        infer_organism_from_ensembl_id,\n    )\n\n    if values is None:\n        values = []\n\n    # if the field is bionty.Gene.ensembl_gene_id, infer organism from ensembl id\n    if (\n        registry.__get_name_with_module__() == \"bionty.Gene\"\n        and field.field.name == \"ensembl_gene_id\"\n        and len(values) > 0\n        and organism is None\n    ):\n        # Check if values contain bionty.Gene objects with organism field\n        from collections.abc import Iterable\n\n        # first check if we have Gene objects\n        for v in values:\n            # early return to not loop through all values to find a string\n            if isinstance(v, str):\n                break\n            if isinstance(v, registry) and v.organism is not None:\n                return v.organism\n            # Handle iterables containing Gene objects (but not strings, which are also iterable)\n            elif isinstance(v, Iterable) and not isinstance(v, str):\n                for item in v:\n                    if isinstance(item, registry) and item.organism is not None:\n                        return item.organism\n\n        # If no bionty.Gene with organism found, fall back to string-based inference\n        # pass the first ensembl id that starts with ENS to infer organism\n        first_ensembl = next(\n            (v for v in values if isinstance(v, str) and v.startswith(\"ENS\")), \"\"\n        )\n        if first_ensembl:\n            return infer_organism_from_ensembl_id(first_ensembl, using_key)\n\n    return create_or_get_organism_record(\n        organism=organism, registry=registry, field=field\n    )\n"
  },
  {
    "path": "lamindb/models/_is_versioned.py",
    "content": "from __future__ import annotations\n\nfrom pathlib import PurePosixPath\nfrom typing import TYPE_CHECKING, Any, Iterable, Literal\n\nfrom django.db import models\nfrom django.db.models import Q\nfrom lamin_utils import logger\nfrom lamin_utils._base62 import increment_base62\n\nfrom lamindb.base import uids\nfrom lamindb.base.fields import (\n    BooleanField,\n    CharField,\n)\n\nif TYPE_CHECKING:  # noqa\n    from lamindb.models.query_set import QuerySet\n\n\nclass IsVersioned(models.Model):\n    \"\"\"Base class for versioned models.\"\"\"\n\n    class Meta:\n        abstract = True\n\n    _len_stem_uid: int\n\n    version_tag: str | None = CharField(max_length=30, null=True, db_index=True)\n    \"\"\"Version tag (default `None`).\n\n    Consider using `semantic versioning <https://semver.org>`__\n    with `Python versioning <https://peps.python.org/pep-0440/>`__.\n    \"\"\"\n    is_latest: bool = BooleanField(default=True, db_index=True)\n    \"\"\"Boolean flag that indicates whether a record is the latest in its version family.\"\"\"\n\n    def __init__(\n        self,\n        *args,\n        **kwargs,\n    ):\n        self._revises = kwargs.pop(\"revises\", None)\n        super().__init__(*args, **kwargs)\n\n    @property\n    def stem_uid(self) -> str:\n        \"\"\"Universal id characterizing the version family.\n\n        The full uid of a record is obtained via concatenating the stem uid and version information::\n\n            stem_uid = random_base62(n_char)  # a random base62 sequence of length 12 (transform) or 16 (artifact, collection)\n            version_uid = \"0000\"  # an auto-incrementing 4-digit base62 number\n            uid = f\"{stem_uid}{version_uid}\"  # concatenate the stem_uid & version_uid\n\n        \"\"\"\n        return self.uid[: self._len_stem_uid]  # type: ignore\n\n    @property\n    def version(self) -> str:\n        \"\"\"The version of an object.\n\n        Defines version of an object within a family of 
objects characterized by the same `stem_uid`.\n\n        Returns `.version_tag` if set, otherwise the last 4 characters of the `uid`.\n        \"\"\"\n        return self.version_tag if self.version_tag else self.uid[-4:]  # type: ignore\n\n    @version.setter\n    def version(self, value: str | None) -> None:\n        self.version_tag = value\n\n    @property\n    def versions(self) -> QuerySet:\n        \"\"\"Lists all records of the same version family.\n\n        Example::\n\n            artifact.versions.to_dataframe()       # all versions of the artifact in a dataframe\n            artifact.versions.get(is_latest=True)  # the latest version of the artifact\n        \"\"\"\n        return (\n            self.__class__.connect(self._state.db)\n            .filter(uid__startswith=self.stem_uid)\n            .order_by(\"-created_at\")\n        )\n\n    def _add_to_version_family(\n        self, revises: IsVersioned, version_tag: str | None = None\n    ):\n        \"\"\"Add current record to a version family.\n\n        Args:\n            revises: a record that belongs to the version family.\n            version_tag: semantic version tag of the record.\n        \"\"\"\n        old_uid = self.uid  # type: ignore\n        new_uid, revises = create_uid(revises=revises, version_tag=version_tag)\n        if (\n            self.__class__.__name__ == \"Artifact\"\n            and self._real_key is None\n            and (self._key_is_virtual or self.key is None)\n        ):\n            from lamindb.core.storage.paths import auto_storage_key_from_artifact_uid\n\n            old_path = self.path\n            new_storage_key = auto_storage_key_from_artifact_uid(\n                new_uid, self.suffix, self._overwrite_versions\n            )\n            new_path = old_path.rename(\n                old_path.with_name(PurePosixPath(new_storage_key).name)\n            )\n            logger.success(f\"updated path from {old_path} to {new_path}!\")\n        self.uid = new_uid\n   
     self.version_tag = version_tag\n        self.save()\n        logger.success(f\"updated uid from {old_uid} to {new_uid}!\")\n\n\ndef bump_version(\n    version: str,\n    bump_type: str = \"minor\",\n    behavior: Literal[\"prompt\", \"error\", \"ignore\"] = \"error\",\n) -> str:\n    \"\"\"Bumps the version number by major or minor depending on the bump_type flag.\n\n    Args:\n        version: The current version in \"MAJOR\" or \"MAJOR.MINOR\" format.\n        bump_type: The type of version bump, either 'major' or 'minor'.\n\n    Returns:\n        The new version string.\n    \"\"\"\n    try:\n        # Split the version into major and minor parts if possible\n        parts = version.split(\".\")\n        major = int(parts[0])\n        minor = int(parts[1]) if len(parts) > 1 else 0\n\n        if bump_type == \"major\":\n            # Bump the major version and reset the minor version\n            new_version = f\"{major + 1}\"\n        elif bump_type == \"minor\":\n            # Bump the minor version\n            new_version = f\"{major}.{minor + 1}\"\n        else:\n            raise ValueError(\"bump_type must be 'major' or 'minor'\")\n\n    except (ValueError, IndexError):\n        if behavior == \"prompt\":\n            new_version = input(\n                f\"The current version is '{version}' - please type the new version: \"\n            )\n        elif behavior == \"error\":\n            raise ValueError(\n                \"Cannot auto-increment non-integer castable version, please provide\"\n                \" manually\"\n            ) from None\n        else:\n            logger.warning(\"could not auto-increment version, fix '?' 
manually\")\n            new_version = \"?\"\n    return new_version\n\n\ndef set_version(version: str | None = None, previous_version: str | None = None):\n    \"\"\"(Auto-) set version.\n\n    If `version` is `None` and `previous_version` is not `None`, returns the\n    major-bumped `previous_version`.\n    Otherwise returns the passed `version` unchanged.\n\n    Args:\n        version: Version string.\n        previous_version: Previous version string.\n    \"\"\"\n    if version is None and previous_version is not None:\n        version = bump_version(previous_version, bump_type=\"major\")\n    return version\n\n\ndef create_uid(\n    *,\n    version_tag: str | None = None,\n    n_full_id: int = 20,\n    revises: IsVersioned | None = None,\n) -> tuple[str, IsVersioned | None]:\n    \"\"\"This also updates revises in case it's not the latest version.\n\n    This is why it returns revises.\n    \"\"\"\n    if revises is not None:\n        latest_in_family = (\n            revises.__class__.objects.filter(uid__startswith=revises.stem_uid)\n            .order_by(\"uid\")\n            .last()\n        )\n        if latest_in_family is not None and latest_in_family.uid != revises.uid:\n            revises = latest_in_family\n            logger.warning(\n                f\"didn't pass the latest version in `revises`, retrieved it: {revises}\"\n            )\n        suid = revises.stem_uid\n        vuid = increment_base62(revises.uid[-4:])  # type: ignore\n    else:\n        suid = uids.base62(n_full_id - 4)\n        vuid = \"0000\"\n    if version_tag is not None:\n        if not isinstance(version_tag, str):\n            raise ValueError(\n                \"`version` parameter must be `None` or `str`, e.g., '0.1', '1', '2', etc.\"\n            )\n        if revises is not None:\n            if version_tag == revises.version_tag:\n                raise ValueError(\n                    f\"Please change the version tag or leave it `None`, '{revises.version_tag}' is already taken\"\n                )\n    return suid + vuid, 
revises\n\n\ndef process_revises(\n    revises: IsVersioned | None,\n    version_tag: str | None,\n    key: str | None,\n    description: str | None,\n    type: type[IsVersioned],\n) -> tuple[str, str, str, str, IsVersioned | None]:\n    if revises is not None and not isinstance(revises, type):\n        raise TypeError(f\"`revises` has to be of type `{type.__name__}`\")\n    uid, revises = create_uid(\n        revises=revises, version_tag=version_tag, n_full_id=type._len_full_uid\n    )\n    if revises is not None:\n        if description is None:\n            description = getattr(revises, \"description\", None)\n        if key is None:\n            key = revises.key\n    return uid, version_tag, key, description, revises\n\n\ndef _adjust_is_latest_when_deleting_is_versioned(\n    objects: IsVersioned | Iterable[IsVersioned],\n) -> list[int]:\n    \"\"\"After deleting (soft or permanent) versioned records, promote new latest per version family.\n\n    Accepts a single IsVersioned instance, a QuerySet, or a list of IsVersioned.\n    Runs in 1 query (candidates + update) when objects are passed; no extra query for uids.\n    Returns the list of pks that were promoted to is_latest (for testing).\n    \"\"\"\n    if isinstance(objects, IsVersioned):\n        objects = [objects]\n    else:\n        objects = list(objects)\n    if not objects:\n        return []\n    id_list = [o.pk for o in objects]\n    stem_uids = list({o.uid[: o._len_stem_uid] for o in objects if o.is_latest})\n    if not stem_uids:\n        return []\n    registry = type(objects[0])\n    db = getattr(objects[0]._state, \"db\", None) or \"default\"\n    len_stem = registry._len_stem_uid\n    # All candidates: same family as any stem_uid, not in trash and not about to be deleted\n    q = Q()\n    for s in stem_uids:\n        q |= Q(uid__startswith=s)\n    qs = registry.objects.using(db).filter(q).exclude(pk__in=id_list)\n    from .sqlrecord import SQLRecord\n\n    if issubclass(registry, 
SQLRecord):\n        qs = qs.exclude(branch_id=-1)\n    candidates = list(qs.values(\"pk\", \"uid\", \"created_at\"))\n    # per stem_uid, pick candidate with max created_at\n    by_stem: dict[str, dict[str, Any]] = {}\n    for c in candidates:\n        stem = c[\"uid\"][:len_stem]\n        if stem not in by_stem or c[\"created_at\"] > by_stem[stem][\"created_at\"]:\n            by_stem[stem] = c\n    if not by_stem:\n        return []\n    pks = [by_stem[s][\"pk\"] for s in by_stem]\n    registry.objects.using(db).filter(pk__in=pks).update(is_latest=True)\n    if pks:\n        promoted_uids = [by_stem[s][\"uid\"] for s in by_stem]\n        if len(promoted_uids) == 1:\n            logger.important_hint(\n                f\"new latest {registry.__name__} version is: {promoted_uids[0]}\"\n            )\n        else:\n            logger.important_hint(\n                f\"new latest {registry.__name__} versions: {promoted_uids}\"\n            )\n    return pks\n\n\ndef reconcile_is_latest_within_branch(\n    registry: type[IsVersioned],\n    *,\n    branch_id: int,\n    db: str = \"default\",\n) -> int:\n    \"\"\"Keep a single is_latest=True per version family in a branch.\n\n    Winner selection is based on newest created_at, tie-broken by highest pk.\n    Returns the number of records demoted from is_latest=True to False.\n    \"\"\"\n    len_stem = registry._len_stem_uid\n    latest_records = list(\n        registry.objects.using(db)\n        .filter(branch_id=branch_id, is_latest=True)\n        .values(\"pk\", \"uid\", \"created_at\")\n        .order_by(\"uid\", \"created_at\", \"pk\")\n    )\n    if not latest_records:\n        return 0\n    winners_by_stem: dict[str, dict[str, Any]] = {}\n    losers: list[int] = []\n    for record in latest_records:\n        stem = record[\"uid\"][:len_stem]\n        winner = winners_by_stem.get(stem)\n        if winner is None:\n            winners_by_stem[stem] = record\n            continue\n        if 
(record[\"created_at\"], record[\"pk\"]) > (winner[\"created_at\"], winner[\"pk\"]):\n            losers.append(winner[\"pk\"])\n            winners_by_stem[stem] = record\n        else:\n            losers.append(record[\"pk\"])\n    if not losers:\n        return 0\n    return registry.objects.using(db).filter(pk__in=losers).update(is_latest=False)\n"
  },
  {
    "path": "lamindb/models/_label_manager.py",
    "content": "from __future__ import annotations\n\nfrom collections import defaultdict\nfrom typing import TYPE_CHECKING\n\nfrom django.db import connections\nfrom rich.table import Column, Table\nfrom rich.text import Text\nfrom rich.tree import Tree\n\nfrom lamindb.models import CanCurate, Feature\nfrom lamindb.models._from_values import _format_values\nfrom lamindb.models.save import save\nfrom lamindb.models.sqlrecord import (\n    REGISTRY_UNIQUE_FIELD,\n    get_name_field,\n    transfer_fk_to_default_db_bulk,\n    transfer_to_default_db,\n)\n\nfrom ._describe import (\n    NAME_WIDTH,\n    TYPE_WIDTH,\n    VALUES_WIDTH,\n    format_rich_tree,\n)\nfrom ._django import get_artifact_or_run_with_related, get_related_model\nfrom ._relations import dict_related_model_to_related_name\n\nif TYPE_CHECKING:\n    from lamindb.models import Artifact, Collection, SQLRecord\n    from lamindb.models.query_set import QuerySet\n\nEXCLUDE_LABELS = {\"schemas\"}\n\n\ndef _get_labels(\n    obj, links: bool = False, instance: str | None = None\n) -> dict[str, QuerySet]:\n    \"\"\"Get all labels associated with an object as a dictionary.\n\n    This is a generic approach that uses django orm.\n    \"\"\"\n    if obj.id is None:\n        return {}\n\n    labels = {}\n    related_models = dict_related_model_to_related_name(\n        obj.__class__, links=links, instance=instance\n    )\n    if obj.__class__.__name__ == \"Artifact\" and links:\n        related_models[\"ArtifactArtifact\"] = \"links_artifact\"\n    for _, related_name in related_models.items():\n        if (\n            related_name not in EXCLUDE_LABELS\n            and not related_name.startswith(\"_\")\n            and not related_name == \"json_values\"\n        ):\n            labels[related_name] = getattr(obj, related_name).all()\n    return labels\n\n\ndef _get_labels_postgres(\n    self: Artifact | Collection, m2m_data: dict | None = None\n) -> dict[str, dict[int, str]]:\n    \"\"\"Get all labels 
associated with an artifact or collection as a dictionary.\n\n    This is a postgres-specific approach that uses django Subquery.\n    \"\"\"\n    if m2m_data is None:\n        artifact_meta = get_artifact_or_run_with_related(self, include_m2m=True)\n        m2m_data = artifact_meta.get(\"related_data\", {}).get(\"m2m\", {})\n    return m2m_data\n\n\ndef describe_labels(\n    self: Artifact | Collection,\n    related_data: dict | None = None,\n) -> Tree | None:\n    \"\"\"Describe labels.\"\"\"\n    labels_data = related_data.get(\"m2m\") if related_data is not None else None\n    if labels_data is None:\n        if (\n            not self._state.adding\n            and connections[self._state.db].vendor == \"postgresql\"\n        ):\n            labels_data = _get_labels_postgres(self, labels_data)\n        if not labels_data:\n            labels_data = _get_labels(self, instance=self._state.db)\n    if not labels_data:\n        return None\n    labels_table = Table(\n        Column(\"\", style=\"\", no_wrap=True, width=NAME_WIDTH),\n        Column(\"\", style=\"dim\", no_wrap=True, width=TYPE_WIDTH),\n        Column(\"\", width=VALUES_WIDTH, no_wrap=True),\n        show_header=False,\n        box=None,\n        pad_edge=False,\n    )\n    for related_name, labels in labels_data.items():\n        if not labels or related_name == \"schemas\":\n            continue\n        if isinstance(labels, dict):\n            displays = [\n                d[key]\n                for d in labels.values()\n                for key in d.keys()\n                if key.endswith(\"_display\")\n            ]\n            print_values = _format_values(displays, n=10, quotes=False)\n        else:  # labels are a QuerySet\n            field = get_name_field(labels)\n            print_values = _format_values(\n                labels.values_list(field, flat=True), n=10, quotes=False\n            )\n        if print_values:\n            related_model = get_related_model(self, 
related_name)\n            type_str = related_model.__get_name_with_module__()\n            labels_table.add_row(\n                f\".{related_name}\", Text(type_str, style=\"dim\"), print_values\n            )\n    tree = None\n    if labels_table.rows:  # we might not have rows even if labels_data was non-empty\n        tree = Tree(Text(\"Labels\", style=\"bold green_yellow\"), guide_style=\"dim\")\n        tree.add(labels_table)\n    return tree\n\n\ndef _save_validated_records(\n    labels: QuerySet | list | dict,\n) -> list[str]:\n    \"\"\"Save validated records from public based on ontology_id_fields.\"\"\"\n    if not labels:\n        return []\n    registry = labels[0].__class__\n    field = (\n        REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), \"uid\")\n        if not hasattr(registry, \"_ontology_id_field\")\n        else registry._ontology_id_field\n    )\n    # if the field value is None, use uid field\n    label_uids = [getattr(label, field) for label in labels if label is not None]\n    # save labels from ontology_ids\n    if hasattr(registry, \"_ontology_id_field\") and label_uids:\n        try:\n            records = registry.from_values(label_uids, field=field, mute=True)\n            save([r for r in records if r._state.adding])\n        except Exception:  # noqa: S110\n            pass\n        field = \"uid\"\n        label_uids = [label.uid for label in labels if label is not None]\n\n    if issubclass(registry, CanCurate):\n        validated = registry.validate(label_uids, field=field, mute=True)\n        new_labels = [\n            label for label, is_valid in zip(labels, validated) if not is_valid\n        ]\n        return new_labels\n    return list(labels)\n\n\ndef save_validated_records(\n    records: QuerySet | list | dict,\n) -> list[str] | dict[str, list[str]]:\n    \"\"\"Save validated records from public based on ontology_id_fields.\"\"\"\n    if isinstance(records, dict):\n        return {\n            registry: 
_save_validated_records(registry_records)\n            for registry, registry_records in records.items()\n        }\n    return _save_validated_records(records)\n\n\nclass LabelManager:\n    \"\"\"Label manager.\n\n    This allows to manage untyped labels :class:`~lamindb.ULabel` and arbitrary\n    typed labels (e.g., :class:`~bionty.CellLine`) and associate labels\n    with features.\n    \"\"\"\n\n    def __init__(self, sqlrecord: Artifact | Collection) -> None:\n        # host is the sqlrecord that the label manager is attached to\n        # we might rename _host to _sqlrecord in the future\n        self._host = sqlrecord\n\n    def __repr__(self) -> str:\n        return self.describe(return_str=True)\n\n    def describe(self, return_str=True) -> str:\n        \"\"\"Describe the labels.\"\"\"\n        tree = describe_labels(self._host)\n        return format_rich_tree(tree, return_str=return_str)\n\n    def add(\n        self,\n        records: SQLRecord | list[SQLRecord] | QuerySet,\n        feature: Feature | None = None,\n    ) -> None:\n        \"\"\"Add one or several labels and associate them with a feature.\n\n        Args:\n            records: Label records to add.\n            feature: Feature under which to group the labels.\n        \"\"\"\n        from .artifact import add_labels\n\n        return add_labels(self._host, records=records, feature=feature)\n\n    def get(\n        self,\n        feature: Feature,\n        mute: bool = False,\n        flat_names: bool = False,\n    ) -> QuerySet | dict[str, QuerySet] | list:\n        \"\"\"Get labels given a feature.\n\n        Args:\n            feature: Feature under which labels are grouped.\n            mute: Show no logging.\n            flat_names: Flatten list to names rather than returning records.\n        \"\"\"\n        from .artifact import get_labels\n\n        return get_labels(self._host, feature=feature, mute=mute, flat_names=flat_names)\n\n    def add_from(self, data: Artifact | 
Collection, transfer_logs: dict = None) -> None:\n        \"\"\"Add labels from an artifact or collection to another artifact or collection.\n\n        Examples:\n\n            ::\n\n                artifact1 = ln.Artifact(pd.DataFrame(index=[0, 1])).save()\n                artifact2 = ln.Artifact(pd.DataFrame(index=[2, 3])).save()\n                records = ln.ULabel.from_values([\"Label1\", \"Label2\"], field=\"name\").save()\n                labels = ln.ULabel.filter(name__icontains = \"label\")\n                artifact1.ulabels.set(labels)  # using the ManyToMany relationship `.ulabels`\n                artifact2.labels.add_from(artifact1)  # using the `.labels` accessor that understands any label type\n        \"\"\"\n        if transfer_logs is None:\n            transfer_logs = {\"mapped\": [], \"transferred\": [], \"run\": None}\n        from lamindb import settings\n\n        using_key = settings._using_key\n        for related_name, labels in _get_labels(data, instance=data._state.db).items():\n            labels = labels.all()\n            if not labels.exists():\n                continue\n            # look for features\n            data_name_lower = data.__class__.__name__.lower()\n            labels_by_features: dict = defaultdict(list)\n            features = set()\n            new_labels = save_validated_records(labels)\n            if len(new_labels) > 0:\n                transfer_fk_to_default_db_bulk(\n                    new_labels, using_key, transfer_logs=transfer_logs\n                )\n            for label in labels:\n                keys: list = []\n                # if the link table doesn't follow this convention, we'll ignore it\n                if not hasattr(label, f\"links_{data_name_lower}\"):\n                    key = None\n                    keys.append(key)\n                else:\n                    links = getattr(label, f\"links_{data_name_lower}\").filter(\n                        **{f\"{data_name_lower}_id\": data.id}\n  
                  )\n                    for link in links:\n                        if link.feature is not None:\n                            features.add(link.feature)\n                            key = link.feature.uid\n                        else:\n                            key = None\n                        keys.append(key)\n                label_returned = transfer_to_default_db(\n                    label,\n                    using_key,\n                    transfer_logs=transfer_logs,\n                    transfer_fk=False,\n                    save=True,\n                )\n                # TODO: refactor return value of transfer to default db\n                if label_returned is not None:\n                    label = label_returned\n                for key in keys:\n                    labels_by_features[key].append(label)\n            # treat features\n            new_features = save_validated_records(list(features))\n            if len(new_features) > 0:\n                transfer_fk_to_default_db_bulk(\n                    new_features, using_key, transfer_logs=transfer_logs\n                )\n                for feature in new_features:\n                    transfer_to_default_db(\n                        feature,  # type: ignore\n                        using_key,\n                        transfer_logs=transfer_logs,\n                        transfer_fk=False,\n                    )\n                save(new_features)  # type: ignore\n            if hasattr(self._host, related_name):\n                for feature_uid, feature_labels in labels_by_features.items():\n                    if feature_uid is not None:\n                        feature_id = Feature.get(feature_uid).id\n                    else:\n                        feature_id = None\n                    getattr(self._host, related_name).add(\n                        *feature_labels, through_defaults={\"feature_id\": feature_id}\n                    )\n"
  },
  {
    "path": "lamindb/models/_relations.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nimport lamindb_setup as ln_setup\nfrom django.db.models import ManyToManyField\nfrom lamindb_setup._connect_instance import (\n    get_owner_name_from_identifier,\n    load_instance_settings,\n)\nfrom lamindb_setup.core._settings_store import instance_settings_file\n\nfrom lamindb.models.sqlrecord import IsLink\n\nif TYPE_CHECKING:\n    from lamindb.models.sqlrecord import Registry, SQLRecord\n\n\ndef get_schema_modules(instance: str | None) -> set[str]:\n    if instance is None or instance == \"default\":\n        schema_modules = set(ln_setup.settings.instance.modules)\n        schema_modules.add(\"core\")\n        return schema_modules\n    owner, name = get_owner_name_from_identifier(instance)\n    settings_file = instance_settings_file(name, owner)\n    if settings_file.exists():\n        modules = set(load_instance_settings(settings_file).modules)\n    else:\n        cache_filepath = (\n            ln_setup.settings.cache_dir / f\"instance--{owner}--{name}--uid.txt\"\n        )\n        if cache_filepath.exists():\n            modules = set(cache_filepath.read_text().split(\"\\n\")[1].split(\",\"))\n        else:\n            raise ValueError(f\"Instance {instance} not found\")\n    shared_schema_modules = set(ln_setup.settings.instance.modules).intersection(\n        modules\n    )\n    shared_schema_modules.add(\"core\")\n    return shared_schema_modules\n\n\n# this function here should likely be renamed\n# it maps the __get_name_with_module__() onto the actual model\ndef dict_module_name_to_model_name(\n    registry: Registry, instance: str | None = None\n) -> dict[str, Registry]:\n    schema_modules = get_schema_modules(instance)\n    d: dict = {\n        i.related_model.__get_name_with_module__(): i.related_model\n        for i in registry._meta.related_objects\n        if i.related_name is not None\n        and i.related_model.__get_module_name__() in 
schema_modules\n    }\n    d.update(\n        {\n            i.related_model.__get_name_with_module__(): i.related_model\n            for i in registry._meta.many_to_many\n            if i.name is not None\n            and i.related_model.__get_module_name__() in schema_modules\n        }\n    )\n    return d\n\n\ndef dict_related_model_to_related_name(\n    registry: type[SQLRecord], links: bool = False, instance: str | None = None\n) -> dict[str, str]:\n    def include(model: SQLRecord):\n        return not links != issubclass(model, IsLink)\n\n    schema_modules = get_schema_modules(instance)\n\n    related_objects = registry._meta.related_objects + registry._meta.many_to_many\n    d: dict = {\n        record.related_model.__get_name_with_module__(): (\n            record.related_name\n            if not isinstance(record, ManyToManyField)\n            else record.name\n        )\n        for record in related_objects\n        if (\n            record.name is not None\n            and include(record.related_model)\n            and record.related_model.__get_module_name__() in schema_modules\n            and not (\n                (\n                    record.related_name\n                    if not isinstance(record, ManyToManyField)\n                    else record.name\n                ).startswith(\"linked_in_\")\n            )\n        )\n    }\n    if \"RecordRecord\" in d:\n        d[\"RecordRecord\"] = \"values_record\"\n    return d\n\n\ndef get_related_name(features_type: type[SQLRecord]) -> str:\n    from lamindb.models.schema import Schema\n\n    candidates = [\n        field.related_name\n        for field in Schema._meta.related_objects\n        if field.related_model == features_type\n    ]\n    if not candidates:\n        raise ValueError(\n            f\"Can't create feature sets from {features_type.__name__} because it's not\"\n            \" related to it!\\nYou need to create a link model between Schema and\"\n            \" your SQLRecord in 
your custom module.\\nTo do so, add a\"\n            \" line:\\n_schemas = models.ManyToMany(Schema,\"\n            \" related_name='mythings')\\n\"\n        )\n    return candidates[0]\n"
  },
  {
    "path": "lamindb/models/_run_cleanup.py",
    "content": "\"\"\"Background cleanup of report/environment artifacts after Run bulk delete.\n\nRunnable as: python -m lamindb.models._run_cleanup --instance owner/name --ids 1,2,3 [--run-uid UID]\n\"\"\"\n\nimport argparse\nimport logging\n\nfrom lamin_utils import logger\n\nimport lamindb as ln\n\n\ndef main() -> None:\n    parser = argparse.ArgumentParser(description=\"Clean up orphaned run artifacts.\")\n    parser.add_argument(\"--instance\", required=True, help=\"Instance slug (owner/name).\")\n    parser.add_argument(\"--ids\", required=True, help=\"Comma-separated artifact IDs.\")\n    parser.add_argument(\n        \"--run-uid\",\n        required=True,\n        help=\"Run UID for log file name (run_cleanup_logs_{uid}.txt in cache dir).\",\n    )\n    args = parser.parse_args()\n\n    ln.connect(args.instance)\n\n    file_handler = None\n    log_path = ln.setup.settings.cache_dir / f\"run_cleanup_logs_{args.run_uid}.txt\"\n    file_handler = logging.FileHandler(log_path, mode=\"a\")\n    logger.addHandler(file_handler)\n\n    for aid_str in args.ids.split(\",\"):\n        aid = int(aid_str.strip())\n        artifact = ln.Artifact.objects.filter(id=aid).first()\n        if artifact is not None:\n            assert artifact.kind == \"__lamindb_run__\", (\n                f\"artifact {artifact.uid} is not of __lamindb_run__ kind, aborting cleanup of artifacts {args.ids}\"\n            )\n            try:\n                artifact.delete(permanent=True)\n                logger.important(f\"deleted artifact {aid}\")\n            except Exception as e:\n                logger.error(f\"did not delete artifact {aid}: {e}\")\n                pass\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "lamindb/models/artifact.py",
    "content": "# ruff: noqa: TC004\nfrom __future__ import annotations\n\nimport shutil\nimport types\nimport warnings\nfrom collections import defaultdict\nfrom pathlib import Path, PurePath, PurePosixPath\nfrom typing import TYPE_CHECKING, Any, Iterator, Literal, TypeVar, Union, overload\n\nimport fsspec\nimport lamindb_setup as ln_setup\nfrom django.db import ProgrammingError, models\nfrom django.db.models import CASCADE, PROTECT, Q\nfrom django.db.models.functions import Length\nfrom lamin_utils import colors, logger\nfrom lamindb_setup import settings as setup_settings\nfrom lamindb_setup.core._hub_core import select_storage_or_parent\nfrom lamindb_setup.core.hashing import HASH_LENGTH, hash_dir, hash_file\nfrom lamindb_setup.core.upath import (\n    LocalPathClasses,\n    UPath,\n    create_path,\n    extract_suffix_from_path,\n    fs_for_moving,\n    get_stat_dir_cloud,\n    get_stat_file_cloud,\n)\n\nfrom ..base.fields import (\n    BigIntegerField,\n    BooleanField,\n    CharField,\n    ForeignKey,\n    TextField,\n)\nfrom ..base.users import current_user_id\nfrom ..base.utils import deprecated, strict_classmethod\nfrom ..core._compat import with_package_obj\nfrom ..core._settings import settings\nfrom ..errors import (\n    FieldValidationError,\n    InvalidArgument,\n    NoStorageLocationForSpace,\n    NoWriteAccess,\n    UnknownStorageLocation,\n    ValidationError,\n)\nfrom ._feature_manager import (\n    FeatureManager,\n    get_label_links,\n)\nfrom ._is_versioned import (\n    IsVersioned,\n    create_uid,\n)\nfrom ._relations import (\n    dict_module_name_to_model_name,\n    dict_related_model_to_related_name,\n)\nfrom .feature import Feature, JsonValue\nfrom .has_parents import view_lineage\nfrom .query_set import QuerySet, SQLRecordList\nfrom .run import Run, TracksRun, TracksUpdates, User\nfrom .save import check_and_attempt_clearing, check_and_attempt_upload\nfrom .schema import Schema\nfrom .sqlrecord import (\n    BaseSQLRecord,\n    
Branch,\n    IsLink,\n    Space,\n    SQLRecord,\n    _get_record_kwargs,\n)\nfrom .storage import Storage\nfrom .ulabel import ULabel\n\n\ndef _lazy_load_storage_module():\n    \"\"\"Lazy-import storage to avoid loading pandas/anndata at package import.\"\"\"\n    from ..core.storage import (\n        delete_storage,\n        infer_suffix,\n        write_to_disk,\n    )\n    from ..core.storage.paths import (\n        AUTO_KEY_PREFIX,\n        auto_storage_key_from_artifact,\n        auto_storage_key_from_artifact_uid,\n        check_path_is_child_of_root,\n        filepath_cache_key_from_artifact,\n        filepath_from_artifact,\n    )\n\n    return types.SimpleNamespace(\n        delete_storage=delete_storage,\n        infer_suffix=infer_suffix,\n        write_to_disk=write_to_disk,\n        AUTO_KEY_PREFIX=AUTO_KEY_PREFIX,\n        auto_storage_key_from_artifact=auto_storage_key_from_artifact,\n        auto_storage_key_from_artifact_uid=auto_storage_key_from_artifact_uid,\n        check_path_is_child_of_root=check_path_is_child_of_root,\n        filepath_cache_key_from_artifact=filepath_cache_key_from_artifact,\n        filepath_from_artifact=filepath_from_artifact,\n    )\n\n\n# Cache the storage utils on first use\n_storage_cache: object | None = None\n\n\n# refactor this module to group logic that needs storage access in a class\n# in the future; then we don't need _s() anymore\ndef _s():\n    global _storage_cache\n    if _storage_cache is None:\n        _storage_cache = _lazy_load_storage_module()\n    return _storage_cache\n\n\nWARNING_RUN_TRANSFORM = \"no run & transform got linked, call `ln.track()` & re-run\"\n\nWARNING_NO_INPUT = \"run input wasn't tracked, call `ln.track()` and re-run\"\n\n\ndef _identify_zarr_type(storepath, *, check: bool = True):\n    \"\"\"Lazy-import to avoid loading storage at package import.\"\"\"\n    try:\n        from ..core.storage._zarr import identify_zarr_type\n\n        return identify_zarr_type(storepath, 
check=check)\n    except ImportError:\n        raise ImportError(\"Please install zarr: pip install 'lamindb[zarr]'\") from None\n\n\nif TYPE_CHECKING:\n    from collections.abc import Iterable\n\n    import pandas as pd\n    from anndata import AnnData\n    from fsspec import AbstractFileSystem\n    from lamindb_setup.types import AnyPathStr\n    from mudata import MuData  # noqa: TC004\n    from polars import LazyFrame as PolarsLazyFrame\n    from pyarrow.dataset import Dataset as PyArrowDataset\n    from spatialdata import SpatialData  # noqa: TC004\n    from tiledbsoma import Collection as SOMACollection\n    from tiledbsoma import Experiment as SOMAExperiment\n    from tiledbsoma import Measurement as SOMAMeasurement\n\n    from ..base.types import (\n        ArtifactKind,\n        StrField,\n    )\n    from ..core.storage._backed_access import (\n        AnnDataAccessor,\n        BackedAccessor,\n        SpatialDataAccessor,\n    )\n    from ..core.storage.types import ScverseDataStructures\n    from ._label_manager import LabelManager\n    from .block import ArtifactBlock\n    from .collection import Collection\n    from .project import Project, Reference\n    from .query_manager import RelatedManager\n    from .record import Record\n    from .transform import Transform\n\n\nOUTDATED_ARTIFACT_FILES_OVERWRITTEN_MSG = (\n    \"Cannot read this outdated artifact version: \"\n    \"its files were overwritten and are no longer available.\\n\"\n    \"Read from the latest version: artifact.versions.get(is_latest=True)\"\n)\n\n\ndef process_pathlike(\n    filepath: UPath,\n    storage: Storage,\n    using_key: str | None,\n    skip_existence_check: bool = False,\n) -> tuple[Storage, bool]:\n    \"\"\"Determines the appropriate storage for a given path and whether to use an existing storage key.\"\"\"\n    if not skip_existence_check:\n        try:  # check if file exists\n            if not filepath.exists():\n                raise FileNotFoundError(filepath)\n      
  except PermissionError:\n            pass\n    if _s().check_path_is_child_of_root(filepath, storage.root):\n        use_existing_storage_key = True\n        return storage, use_existing_storage_key\n    else:\n        # check whether the path is part of one of the existing\n        # already-registered storage locations\n        result = None\n        # within the hub, we don't want to perform check_path_in_existing_storage\n        if using_key is None:\n            result = check_path_in_existing_storage(\n                filepath, check_hub_register_storage=setup_settings.instance.is_on_hub\n            )\n        if isinstance(result, Storage):\n            use_existing_storage_key = True\n            return result, use_existing_storage_key\n        else:\n            # if the path is in the cloud, we have a good candidate\n            # for the storage root: the bucket\n            if not isinstance(filepath, LocalPathClasses):\n                # for a cloud path, new_root is always the bucket name\n                if filepath.protocol == \"hf\":\n                    hf_path = filepath.fs.resolve_path(filepath.as_posix())\n                    if hasattr(hf_path, \"root\"):\n                        new_root = \"hf://\" + hf_path.root\n                    else:\n                        hf_path.path_in_repo = \"\"\n                        new_root = \"hf://\" + hf_path.unresolve().rstrip(\"/\")\n                else:\n                    if filepath.protocol == \"s3\":\n                        # check that endpoint_url didn't propagate here\n                        # as a part of the path string\n                        assert \"?\" not in filepath.path  # noqa: S101\n                    new_root = list(filepath.parents)[-1].as_posix().rstrip(\"/\")\n                # Re the Parallel execution of the logic below:\n                # One of the threads (or processes) would start to write the hub record and then the test file.\n                # The other ones 
would retrieve the hub record and the test file.\n                # All of them would come out of the exercise with storage_record.instance_uid == setup_settings.instance.uid\n                # and all of them would raise UnkownStorageLocation.\n                # Then one of these threads will trigger storage_record.delete() but also this is idempotent;\n                # this means they all throw the same error and deletion of the inexistent stuff (hub record, marker file)\n                # would just silently fail.\n                # Edge case: A user legitimately creates a storage location and another user runs this here at the exact same time.\n                # There is no way to decide then which is the legitimate creation.\n                storage_record = Storage(root=new_root).save()\n                if storage_record.instance_uid == setup_settings.instance.uid:\n                    # we don't want to inadvertently create managed storage locations\n                    # hence, we revert the creation and throw an error\n                    storage_record.delete()\n                    raise UnknownStorageLocation(\n                        f\"Path {filepath} is not contained in any known storage location:\\n{Storage.to_dataframe()[['uid', 'root', 'type']]}\\n\\n\"\n                        f\"Create a managed storage location that contains the path, e.g., by calling: ln.Storage(root='{new_root}').save()\"\n                    )\n                use_existing_storage_key = True\n                return storage_record, use_existing_storage_key\n            # if the filepath is local\n            else:\n                use_existing_storage_key = False\n                # if the default storage is local we'll throw an error if the user\n                # doesn't provide a key\n                if storage.type == \"local\":\n                    return storage, use_existing_storage_key\n                # if the default storage is in the cloud (the file is going to\n    
            # be uploaded upon saving it), we treat the filepath as a cache\n                else:\n                    return storage, use_existing_storage_key\n\n\ndef process_data(\n    provisional_uid: str,\n    data: AnyPathStr | pd.DataFrame | AnnData,\n    format: str | None,\n    key: str | None,\n    storage: Storage,\n    using_key: str | None,\n    skip_existence_check: bool = False,\n    is_replace: bool = False,\n    to_disk_kwargs: dict[str, Any] | None = None,\n) -> tuple[Any, Path | UPath, str, Storage, bool]:\n    \"\"\"Serialize a data object that's provided as file or in memory.\n\n    if not overwritten, data gets stored in default storage\n    \"\"\"\n    if with_package_obj(data, \"AnnData\", \"anndata\", lambda obj: True)[0]:\n        is_anndata = True\n        is_pathlike = False\n    elif isinstance(data, (str, Path, UPath)):\n        is_anndata = False\n        is_pathlike = True\n    else:\n        is_anndata = False\n        is_pathlike = False\n\n    if key is not None:\n        key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name=\"key\")\n        # use suffix as the (adata) format if the format is not provided\n        if is_anndata and format is None and len(key_suffix) > 0:\n            format = key_suffix[1:]\n    else:\n        key_suffix = None\n\n    if is_pathlike:\n        access_token = (\n            storage._access_token if hasattr(storage, \"_access_token\") else None\n        )\n        path = create_path(data, access_token=access_token)\n        # we don't resolve http links because they can resolve into a different domain\n        # for example into a temporary url\n        if path.protocol not in {\"http\", \"https\"}:\n            path = path.resolve()\n\n        storage, use_existing_storage_key = process_pathlike(\n            path,\n            storage=storage,\n            using_key=using_key,\n            skip_existence_check=skip_existence_check,\n        )\n        suffix = 
extract_suffix_from_path(path)\n        memory_rep = None\n    elif (\n        is_anndata\n        or data_is_dataframe(data)\n        or data_is_scversedatastructure(data, \"MuData\")\n        or data_is_scversedatastructure(data, \"SpatialData\")\n    ):\n        storage = storage\n        memory_rep = data\n        suffix = _s().infer_suffix(data, format)\n    else:\n        raise NotImplementedError(\n            f\"Do not know how to create an Artifact from {data}, pass a path instead.\"\n        )\n\n    # Check for suffix consistency\n    if key_suffix is not None and key_suffix != suffix and not is_replace:\n        # consciously omitting a trailing period\n        if is_pathlike:\n            message = f\"The passed path's suffix '{suffix}' must match the passed key's suffix '{key_suffix}'.\"\n        else:\n            message = f\"The passed key's suffix '{key_suffix}' must match the passed path's suffix '{suffix}'.\"\n        raise InvalidArgument(message)\n\n    # in case we have an in-memory representation, we need to write it to disk\n    if memory_rep is not None:\n        path = settings.cache_dir / f\"{provisional_uid}{suffix}\"\n        logger.info(\"writing the in-memory object into cache\")\n        if to_disk_kwargs is None:\n            to_disk_kwargs = {}\n        _s().write_to_disk(data, path, **to_disk_kwargs)\n        use_existing_storage_key = False\n\n    return memory_rep, path, suffix, storage, use_existing_storage_key\n\n\ndef get_stat_or_artifact(\n    path: UPath,\n    storage: Record,\n    key: str | None = None,\n    check_hash: bool = True,\n    is_replace: bool = False,\n    instance: str | None = None,\n    skip_hash_lookup: bool = False,\n) -> Union[tuple[int, str | None, str | None, int | None, Artifact | None], Artifact]:\n    \"\"\"Retrieves file statistics or an existing artifact based on the path, hash, and key.\"\"\"\n    n_files = None\n    if settings.creation.artifact_skip_size_hash:\n        return None, None, None, 
n_files, None\n    stat = path.stat()  # one network request\n    if not isinstance(path, LocalPathClasses):\n        size, hash, hash_type = None, None, None\n        if stat is not None:\n            # convert UPathStatResult to fsspec info dict\n            stat = stat.as_info()\n            if (store_type := stat[\"type\"]) == \"file\":\n                size, hash, hash_type = get_stat_file_cloud(stat)\n            elif store_type == \"directory\":\n                size, hash, hash_type, n_files = get_stat_dir_cloud(path)\n        if hash is None:\n            logger.warning(f\"did not add hash for {path}\")\n            return size, hash, hash_type, n_files, None\n    else:\n        if path.is_dir():\n            size, hash, hash_type, n_files = hash_dir(path)\n        else:\n            size, hash, hash_type = hash_file(path)\n    if not check_hash:\n        return size, hash, hash_type, n_files, None\n    # Empty files all share the same content hash; skip cross-artifact hash\n    # lookup so creating a new empty file path yields a new artifact.\n    if n_files is None and size == 0:\n        skip_hash_lookup = True\n    previous_artifact_version = None\n    artifacts_qs = Artifact.objects.using(instance)\n    if skip_hash_lookup:\n        artifact_with_same_hash_exists = False\n        if key is not None and not is_replace:\n            # only search for a previous version of the artifact\n            # ignoring hash\n            queryset_same_hash_or_same_key = artifacts_qs.filter(\n                ~Q(branch_id=-1),\n                key=key,\n                storage=storage,\n            ).order_by(\"-created_at\")\n        else:\n            queryset_same_hash_or_same_key = []\n    else:\n        # this purposefully leaves out the storage location and key that we have\n        # in the hard database unique constraints\n        # so that the user is able to find artifacts with the same hash across\n        # storage locations and keys\n        # if this is 
not desired, set skip_hash_lookup=True\n        if key is None or is_replace:\n            queryset_same_hash = artifacts_qs.filter(~Q(branch_id=-1), hash=hash)\n            artifact_with_same_hash_exists = queryset_same_hash.count() > 0\n        else:\n            # the following query achieves one more thing beyond hash lookup\n            # it allows us to find a previous version of the artifact based on\n            # matching key & storage even if the hash is different\n            # we do this here so that we don't have to do an additional query later\n            # see the `previous_artifact_version` variable below\n            queryset_same_hash_or_same_key = artifacts_qs.filter(\n                ~Q(branch_id=-1),\n                Q(hash=hash) | Q(key=key, storage=storage),\n            ).order_by(\"-created_at\")\n            queryset_same_hash = queryset_same_hash_or_same_key.filter(hash=hash)\n            artifact_with_same_hash_exists = queryset_same_hash.count() > 0\n    if key is not None and not is_replace:\n        if (\n            not artifact_with_same_hash_exists\n            and queryset_same_hash_or_same_key.count() > 0\n        ):\n            logger.important(\n                f\"creating new artifact version for key '{key}' in storage '{storage.root}'\"\n            )\n            previous_artifact_version = queryset_same_hash_or_same_key[0]\n    if artifact_with_same_hash_exists:\n        artifact_with_same_hash = queryset_same_hash[0]\n        logger.important(\n            f\"returning artifact with same hash: {artifact_with_same_hash}; to track this artifact as an input, use: ln.Artifact.get()\"\n        )\n        return artifact_with_same_hash\n    else:\n        return size, hash, hash_type, n_files, previous_artifact_version\n\n\ndef check_path_in_existing_storage(\n    path: Path | UPath,\n    check_hub_register_storage: bool = False,\n    using_key: str | None = None,\n) -> Storage | None:\n    for storage in 
Storage.objects.using(using_key).order_by(Length(\"root\").desc()):\n        # if path is part of storage, return it\n        if _s().check_path_is_child_of_root(path, root=storage.root):\n            return storage\n    # we don't see parents registered in the db, so checking the hub\n    # just check for 2 writable cloud protocols, maybe change in the future\n    if check_hub_register_storage and getattr(path, \"protocol\", None) in {\"s3\", \"gs\"}:\n        result = select_storage_or_parent(path.as_posix())\n        if result is not None:\n            return Storage(**result, _skip_preparation=True).save()\n    return None\n\n\ndef get_relative_path_to_directory(\n    path: PurePath | Path | UPath, directory: PurePath | Path | UPath\n) -> PurePath | Path | UPath:\n    if isinstance(directory, UPath) and not isinstance(directory, LocalPathClasses):\n        # this is safer for cloud paths such as http paths\n        relpath = PurePath(\n            path.as_posix().replace(directory.as_posix(), \"\").lstrip(\"/\")\n        )\n    elif isinstance(directory, LocalPathClasses):\n        relpath = path.resolve().relative_to(directory.resolve())  # type: ignore\n    elif isinstance(directory, PurePath):\n        relpath = path.relative_to(directory)\n    else:\n        raise TypeError(\"Directory not of type Path or UPath\")\n    return relpath\n\n\ndef get_artifact_kwargs_from_data(\n    *,\n    data: Path | UPath | str | pd.DataFrame | ScverseDataStructures,\n    key: str | None,\n    run: Run | None,\n    format: str | None,\n    provisional_uid: str,\n    version_tag: str | None,\n    storage: Storage,\n    using_key: str | None = None,\n    is_replace: bool = False,\n    skip_check_exists: bool = False,\n    overwrite_versions: bool | None = None,\n    skip_hash_lookup: bool = False,\n    to_disk_kwargs: dict[str, Any] | None = None,\n    key_is_virtual: bool | None = None,\n):\n    memory_rep, path, suffix, storage, use_existing_storage_key = process_data(\n     
   provisional_uid,\n        data,\n        format,\n        key,\n        storage,\n        using_key,\n        skip_check_exists,\n        is_replace=is_replace,\n        to_disk_kwargs=to_disk_kwargs,\n    )\n\n    check_path_in_storage = False\n    real_key = None\n    if use_existing_storage_key:\n        inferred_key = get_relative_path_to_directory(\n            path=path, directory=UPath(storage.root)\n        ).as_posix()\n        if key is None:\n            key = inferred_key\n        elif key != inferred_key:\n            real_key = inferred_key\n        check_path_in_storage = True\n    else:\n        storage = storage\n    stat_or_artifact = get_stat_or_artifact(\n        path=path,\n        storage=storage,\n        key=key,\n        instance=using_key,\n        is_replace=is_replace,\n        skip_hash_lookup=skip_hash_lookup,\n    )\n    if not isinstance(path, LocalPathClasses):\n        local_filepath = None\n        cloud_filepath = path\n    else:\n        local_filepath = path\n        cloud_filepath = None\n    privates = {\n        \"local_filepath\": local_filepath,\n        \"cloud_filepath\": cloud_filepath,\n        \"memory_rep\": memory_rep,\n        \"check_path_in_storage\": check_path_in_storage,\n    }\n    if isinstance(stat_or_artifact, Artifact):\n        existing_artifact = stat_or_artifact\n        # if the artifact was unsuccessfully saved, we want to\n        # enable re-uploading after returning the artifact object\n        # the upload is triggered by whether the privates are returned\n        if existing_artifact._storage_ongoing:\n            privates[\"key\"] = key\n            returned_privates = privates  # re-upload necessary\n        else:\n            returned_privates = {\"key\": key}\n        returned_privates[\"is_artifact_storage_managed_by_current_instance\"] = (\n            existing_artifact.storage.instance_uid == setup_settings.instance.uid\n        )\n        return existing_artifact, returned_privates\n  
  else:\n        size, hash, hash_type, n_files, revises = stat_or_artifact\n\n    # update local path\n    if revises is not None:  # update provisional_uid\n        provisional_uid, revises = create_uid(revises=revises, version_tag=version_tag)\n        if settings.cache_dir in path.parents:\n            path = path.rename(path.with_name(f\"{provisional_uid}{suffix}\"))\n            privates[\"local_filepath\"] = path\n\n    log_storage_hint(\n        check_path_in_storage=check_path_in_storage,\n        storage=storage,\n        key=key,\n        uid=provisional_uid,\n        suffix=suffix,\n        is_dir=n_files is not None,\n    )\n\n    if overwrite_versions is None:\n        overwrite_versions = n_files is not None\n\n    if check_path_in_storage:\n        # True here means that we have a path in an existing storage with a virtual key\n        real_key_is_set = real_key is not None\n        if key_is_virtual is not None and key_is_virtual != real_key_is_set:\n            raise ValueError(\n                f\"Passing a path in an existing storage {'with' if real_key_is_set else 'without'} \"\n                f\"a virtual key and _key_is_virtual={key_is_virtual} is incompatible.\"\n            )\n        # we use an actual storage key if key is not provided explicitly\n        set_key_is_virtual = real_key_is_set\n    else:\n        # do we use a virtual or an actual storage key?\n        set_key_is_virtual = (\n            settings.creation._artifact_use_virtual_keys\n            if key_is_virtual is None\n            else key_is_virtual\n        )\n\n    # needed to check if the artifact storage is managed by the current instance on artifact init\n    privates[\"is_artifact_storage_managed_by_current_instance\"] = (\n        storage.instance_uid == setup_settings.instance.uid\n    )\n\n    kwargs = {\n        \"uid\": provisional_uid,\n        \"suffix\": suffix,\n        \"hash\": hash,\n        \"_hash_type\": hash_type,\n        \"key\": key,\n        
\"size\": size,\n        \"storage_id\": storage.id,\n        \"n_files\": n_files,\n        \"_overwrite_versions\": overwrite_versions,  # True for folder, False for file\n        \"n_observations\": None,  # to implement\n        \"run_id\": run.id if run is not None else None,\n        \"run\": run,\n        \"_key_is_virtual\": set_key_is_virtual,\n        \"revises\": revises,\n        \"_real_key\": real_key,\n    }\n    return kwargs, privates\n\n\ndef log_storage_hint(\n    *,\n    check_path_in_storage: bool,\n    storage: Storage | None,\n    key: str | None,\n    uid: str,\n    suffix: str,\n    is_dir: bool,\n) -> None:\n    hint = \"\"\n    if check_path_in_storage:\n        display_root = storage.root  # type: ignore\n        # check whether path is local\n        if fsspec.utils.get_protocol(storage.root) == \"file\":  # type: ignore\n            # if it's a local path, check whether it's in the current working directory\n            root_path = Path(storage.root)  # type: ignore\n            if _s().check_path_is_child_of_root(root_path, Path.cwd()):\n                # only display the relative path, not the fully resolved path\n                display_root = root_path.relative_to(Path.cwd())  # type: ignore\n        hint += f\"path in storage '{display_root}'\"  # type: ignore\n    else:\n        hint += \"path content will be copied to default storage upon `save()`\"\n    if key is None:\n        storage_key = _s().auto_storage_key_from_artifact_uid(uid, suffix, is_dir)\n        hint += f\" with key `None` ('{storage_key}')\"\n    else:\n        hint += f\" with key '{key}'\"\n    logger.hint(hint)\n\n\ndef data_is_dataframe(data: Any) -> bool:\n    # TODO: maybe check also for pandas.DataFrame subclasses,\n    # but in this case also infer_suffix should be updated\n    return with_package_obj(data, \"DataFrame\", \"pandas\", lambda obj: True)[0]\n\n\ndef data_is_scversedatastructure(\n    data: ScverseDataStructures | AnyPathStr,\n    
structure_type: Literal[\"AnnData\", \"MuData\", \"SpatialData\"] | None = None,\n    cloud_warning: bool = True,\n) -> bool:\n    \"\"\"Determine whether a specific in-memory object or a path is any or a specific scverse data structure.\"\"\"\n    file_suffix = None\n    if structure_type == \"AnnData\":\n        file_suffix = \".h5ad\"\n    elif structure_type == \"MuData\":\n        file_suffix = \".h5mu\"\n    # SpatialData does not have a unique suffix but `.zarr`\n\n    # AnnData allows both AnnDataAccessor and AnnData\n    class_name = data.__class__.__name__\n    if structure_type is None:\n        return any(\n            class_name\n            in ([\"AnnData\", \"AnnDataAccessor\"] if cl_name == \"AnnData\" else [cl_name])\n            for cl_name in [\"AnnData\", \"MuData\", \"SpatialData\"]\n        )\n    elif class_name in (\n        [\"AnnData\", \"AnnDataAccessor\"]\n        if structure_type == \"AnnData\"\n        else [structure_type]\n    ):\n        return True\n\n    data_type = structure_type.lower()\n    if isinstance(data, (str, Path, UPath)):\n        data_path = UPath(data)\n\n        if file_suffix in data_path.suffixes:\n            return True\n\n        if data_path.suffix == \".zarr\":\n            type_suffix = f\".{data_type}\"\n            if type_suffix in data_path.suffixes:\n                return True\n\n            # check only for local, expensive for cloud\n            if fsspec.utils.get_protocol(data_path.as_posix()) == \"file\":\n                return (\n                    _identify_zarr_type(\n                        data_path if structure_type == \"AnnData\" else data,\n                        check=True if structure_type == \"AnnData\" else False,\n                    )\n                    == data_type\n                )\n            elif cloud_warning:\n                logger.warning(\n                    f\"we do not check whether cloud zarr is {structure_type}\"\n                )\n                return 
False\n\n    return False\n\n\ndef data_is_soma_experiment(data: SOMAExperiment | AnyPathStr) -> bool:\n    # We are not importing tiledbsoma here to keep loaded modules minimal\n    if hasattr(data, \"__class__\") and data.__class__.__name__ == \"Experiment\":\n        return True\n    if isinstance(data, (str, Path, UPath)):\n        return UPath(data).suffix == \".tiledbsoma\"\n    return False\n\n\ndef check_otype_artifact(\n    data: AnyPathStr | pd.DataFrame | ScverseDataStructures,\n    otype: str | None = None,\n    cloud_warning: bool = True,\n) -> str:\n    if otype is not None:\n        return otype\n\n    if isinstance(data, (str, Path, UPath)):\n        is_pathlike = True\n        suffix = UPath(data).suffix\n    else:\n        is_pathlike = False\n        suffix = None\n\n    if (is_pathlike and suffix in {\".parquet\", \".csv\", \".ipc\"}) or data_is_dataframe(\n        data\n    ):\n        logger.warning(\"data is a DataFrame, please use .from_dataframe()\")\n        otype = \"DataFrame\"\n        return otype\n    if data_is_scversedatastructure(data, \"AnnData\", cloud_warning):\n        if not is_pathlike:\n            logger.warning(\"data is an AnnData, please use .from_anndata()\")\n        otype = \"AnnData\"\n    elif data_is_scversedatastructure(data, \"MuData\", cloud_warning):\n        if not is_pathlike:\n            logger.warning(\"data is a MuData, please use .from_mudata()\")\n        otype = \"MuData\"\n    elif data_is_scversedatastructure(data, \"SpatialData\", cloud_warning):\n        if not is_pathlike:\n            logger.warning(\"data is a SpatialData, please use .from_spatialdata()\")\n        otype = \"SpatialData\"\n    elif not is_pathlike:\n        raise TypeError(\"data has to be a string, Path, UPath\")\n    return otype\n\n\ndef populate_subsequent_run(record: Artifact | Collection, run: Run | None) -> None:\n    if run is None:\n        return\n    if record.run is None:\n        record.run = run\n    elif 
record.run != run:\n        record.recreating_runs.add(run)\n        record._subsequent_run_id = run.id\n\n\n# also see current_run() in core._data\ndef get_run(run: Run | None) -> Run | None:\n    from ..core._context import context\n    from ..core._functions import get_current_tracked_run\n\n    if run is None:\n        run = get_current_tracked_run()\n        if run is None:\n            run = context.run\n        if run is None and not settings.creation.artifact_silence_missing_run_warning:\n            isettings = setup_settings.instance\n            if not (isettings._is_clone or isettings.is_read_only_connection):\n                logger.warning(WARNING_RUN_TRANSFORM)\n    # suppress run by passing False\n    elif not run:\n        run = None\n    return run\n\n\ndef save_staged_schemas(self: Artifact) -> None:\n    if hasattr(self, \"_staged_schemas\"):\n        from lamindb.models._feature_manager import get_schema_by_slot_\n\n        existing_staged_schemas = get_schema_by_slot_(self)\n        saved_staged_schemas = {}\n        for key, schema in self._staged_schemas.items():\n            if isinstance(schema, Schema) and schema._state.adding:\n                schema.save()\n                saved_staged_schemas[key] = schema\n            if key in existing_staged_schemas:\n                # remove existing feature set on the same slot\n                self.schemas.remove(existing_staged_schemas[key])\n        if len(saved_staged_schemas) > 0:\n            s = \"s\" if len(saved_staged_schemas) > 1 else \"\"\n            display_schema_keys = \",\".join(\n                f\"'{key}'\" for key in saved_staged_schemas.keys()\n            )\n            logger.save(\n                f\"saved {len(saved_staged_schemas)} feature set{s} for slot{s}:\"\n                f\" {display_schema_keys}\"\n            )\n\n\ndef save_schema_links(self: Artifact) -> None:\n    from lamindb.models.save import bulk_create\n\n    if hasattr(self, \"_staged_schemas\"):\n       
 links = []\n        for slot, schema in self._staged_schemas.items():\n            kwargs = {\n                \"artifact_id\": self.id,\n                \"schema_id\": schema.id,\n                \"slot\": slot,\n            }\n            links.append(Artifact.schemas.through(**kwargs))\n        bulk_create(links, ignore_conflicts=True)\n\n\ndef validate_feature(feature: Feature, records: list[SQLRecord]) -> None:\n    \"\"\"Validate feature record, adjust feature.dtype based on labels records.\"\"\"\n    if not isinstance(feature, Feature):\n        raise TypeError(\"feature has to be of type Feature\")\n    if feature._state.adding:\n        registries = {record.__class__.__get_name_with_module__() for record in records}\n        registries_str = \"|\".join(registries)\n        msg = f\"ln.Feature(name='{feature.name}', type='cat[{registries_str}]').save()\"\n        raise ValidationError(f\"Feature not validated. If it looks correct: {msg}\")\n\n\ndef get_labels(\n    self,\n    feature: Feature,\n    mute: bool = False,\n    flat_names: bool = False,\n) -> QuerySet | dict[str, QuerySet] | list:\n    \"\"\"{}\"\"\"  # noqa: D415\n    from .record import Record\n\n    if not isinstance(feature, Feature):\n        raise TypeError(\"feature has to be of type Feature\")\n    dtype_str = feature._dtype_str\n    if dtype_str is None or not dtype_str.startswith(\"cat[\"):\n        raise ValueError(\"feature does not have linked labels\")\n    registries_to_check = dtype_str.replace(\"cat[\", \"\").rstrip(\"]\").split(\"|\")\n    if len(registries_to_check) > 1 and not mute:\n        logger.warning(\"labels come from multiple registries!\")\n    # return an empty query set if self.id is still None\n    if self.id is None:\n        return QuerySet(self.__class__)\n    qs_by_registry = {}\n    for registry in registries_to_check:\n        # currently need to distinguish between ULabel and non-ULabel, because\n        # we only have the feature information for Label\n   
     if registry in {\"ULabel\", \"Record\"}:\n            links_to_labels = get_label_links(self, registry, feature)\n            label_ids = [\n                (link.ulabel_id if registry == \"ULabel\" else link.record_id)\n                for link in links_to_labels\n            ]\n            model = ULabel if registry == \"ULabel\" else Record\n            qs_by_registry[registry] = model.objects.using(self._state.db).filter(\n                id__in=label_ids\n            )\n        elif registry in self.features._accessor_by_registry:\n            qs_by_registry[registry] = getattr(\n                self, self.features._accessor_by_registry[registry]\n            ).all()\n    if flat_names:\n        # returns a flat list of names\n        from .sqlrecord import get_name_field\n\n        values = []\n        for v in qs_by_registry.values():\n            values += v.to_list(get_name_field(v))\n        return values\n    if len(registries_to_check) == 1 and registry in qs_by_registry:\n        return qs_by_registry[registry]\n    else:\n        return qs_by_registry\n\n\ndef add_labels(\n    self,\n    records: SQLRecord | list[SQLRecord] | QuerySet | Iterable,\n    feature: Feature | None = None,\n    *,\n    field: StrField | None = None,\n    from_curator: bool = False,\n) -> None:\n    \"\"\"{}\"\"\"  # noqa: D415\n    if self._state.adding:\n        raise ValueError(\"Please save the artifact/collection before adding a label!\")\n\n    if isinstance(records, (QuerySet, QuerySet.__base__)):  # need to have both\n        records = records.to_list()\n    if isinstance(records, (str, SQLRecord)):\n        records = [records]\n    if not isinstance(records, list):  # avoids warning for pd Series\n        records = list(records)\n    # create records from values\n    if len(records) == 0:\n        return None\n    if isinstance(records[0], str):  # type: ignore\n        records_validated = []\n        # feature is needed if we want to create records from 
values\n        if feature is None:\n            raise ValueError(\n                \"Please pass a feature, e.g., via: label = ln.ULabel(name='my_label',\"\n                \" feature=ln.Feature(name='my_feature'))\"\n            )\n        dtype_str = feature._dtype_str\n        if dtype_str.startswith(\"cat[\"):\n            orm_dict = dict_module_name_to_model_name(Artifact)\n            for reg in dtype_str.replace(\"cat[\", \"\").rstrip(\"]\").split(\"|\"):\n                registry = orm_dict.get(reg)\n                records_validated += registry.from_values(records, field=field)\n\n        # feature doesn't have registries and therefore can't create records from values\n        # ask users to pass records\n        if len(records_validated) == 0:\n            raise ValueError(\n                \"Please pass a record (a `SQLRecord` object), not a string, e.g., via:\"\n                \" label\"\n                f\" = ln.Record(name='{records[0]}')\"  # type: ignore\n            )\n        records = records_validated\n\n    for record in records:\n        if record._state.adding:\n            raise ValidationError(\n                f\"{record} not validated. 
If it looks correct: record.save()\"\n            )\n\n    if feature is None:\n        d = dict_related_model_to_related_name(self.__class__)\n        # strategy: group records by registry to reduce number of transactions\n        records_by_related_name: dict = {}\n        for record in records:\n            related_name = d.get(record.__class__.__get_name_with_module__())\n            if related_name is None:\n                raise ValueError(f\"Can't add labels to {record.__class__} record!\")\n            if related_name not in records_by_related_name:\n                records_by_related_name[related_name] = []\n            records_by_related_name[related_name].append(record)\n        for related_name, records in records_by_related_name.items():\n            getattr(self, related_name).add(*records)\n    else:\n        validate_feature(feature, records)  # type:ignore\n        records_by_registry = defaultdict(list)\n        schemas = self.schemas.filter(itype=\"Feature\")\n        internal_features = set()  # type: ignore\n        if len(schemas) > 0:\n            for schema in schemas:\n                internal_features = internal_features.union(\n                    set(schema.members.values_list(\"name\", flat=True))\n                )  # type: ignore\n        for record in records:\n            records_by_registry[record.__class__.__get_name_with_module__()].append(\n                record\n            )\n        for registry_name, records in records_by_registry.items():\n            if not from_curator and feature.name in internal_features:\n                raise ValidationError(\n                    \"Cannot manually annotate a feature measured *within* the dataset. 
Please use a Curator.\"\n                )\n            dtype_str = feature._dtype_str\n            if registry_name not in dtype_str:\n                if not dtype_str.startswith(\"cat\"):\n                    raise ValidationError(\n                        f\"Feature {feature.name} needs dtype='cat' for label annotation, currently has dtype='{dtype_str}'\"\n                    )\n                if registry_name not in dtype_str:\n                    new_dtype = dtype_str.rstrip(\"]\") + f\"|{registry_name}]\"\n                    raise ValidationError(\n                        f\"Label type {registry_name} is not valid for Feature(name='{feature.name}', dtype='{dtype_str}'), consider a feature with dtype='{new_dtype}'\"\n                    )\n            if registry_name not in self.features._accessor_by_registry:\n                logger.warning(f\"skipping {registry_name}\")\n                continue\n            if len(records) == 0:\n                continue\n            features_labels = {\n                registry_name: [(feature, label_record) for label_record in records]\n            }\n            self.features._add_label_feature_links(\n                features_labels,\n            )\n\n\ndef delete_permanently(artifact: Artifact, storage: bool | None, using_key: str):\n    # need to grab file path before deletion\n    try:\n        path, _ = _s().filepath_from_artifact(artifact, using_key)\n    except OSError:\n        # we can still delete the record\n        logger.warning(\"Could not get path\")\n        storage = False\n    # only delete in storage if DB delete is successful\n    # DB delete might error because of a foreign key constraint violated etc.\n    if artifact._overwrite_versions and artifact.is_latest:\n        logger.important(\n            \"deleting all versions of this artifact because they all share the same store\"\n        )\n        # artifact.versions pulls only versions that are not in trash\n        # this query set below 
contains all versions including those that are in trash\n        versions = Artifact.objects.using(artifact._state.db).filter(\n            uid__startswith=artifact.stem_uid\n        )\n        for version in versions:\n            _delete_skip_storage(version)\n    else:\n        artifact._delete_skip_storage()\n    # by default do not delete storage if deleting only a previous version\n    # and the underlying store is mutable\n    if artifact._overwrite_versions and not artifact.is_latest:\n        delete_in_storage = False\n        if storage:\n            logger.warning(\n                \"storage argument is ignored; can't delete store of a previous version if overwrite_versions is True\"\n            )\n    elif artifact.key is None or (\n        artifact._key_is_virtual and artifact._real_key is None\n    ):\n        # do not ask for confirmation also if storage is None\n        delete_in_storage = storage is None or storage\n    else:\n        # for artifacts with non-virtual semantic storage keys (key is not None)\n        # ask for extra-confirmation if storage is None\n        # the wording here is critical to avoid accidental deletions\n        if storage is None:\n            response = input(\n                f\"Artifact record deleted. Do you ALSO want to delete the data in storage at {path}? 
(y/n) You can't undo\"\n                \" this action.\"\n            )\n            delete_in_storage = response == \"y\"\n        else:\n            delete_in_storage = storage\n    if not delete_in_storage:\n        logger.important(f\"a file/folder remains here: {path}\")\n    # we don't yet have logic to bring back the deleted metadata record\n    # in case storage deletion fails - this is important for ACID down the road\n    if delete_in_storage:\n        delete_msg = _s().delete_storage(path, raise_file_not_found_error=False)\n        if delete_msg != \"did-not-delete\":\n            logger.success(f\"deleted {colors.yellow(f'{path}')}\")\n\n\nclass LazyArtifact:\n    \"\"\"Lazy artifact for streaming to auto-generated internal paths.\n\n    This is needed when it is desirable to stream to a `lamindb` auto-generated internal path\n    and register the path as an artifact (see :class:`~lamindb.Artifact`).\n\n    This object creates a real artifact on `.save()` with the provided arguments.\n\n    Args:\n        suffix: The suffix for the auto-generated internal path\n        overwrite_versions: Whether to overwrite versions.\n        **kwargs: Keyword arguments for the artifact to be created.\n\n    Examples:\n\n        Create a lazy artifact, write to the path and save to get a real artifact::\n\n            lazy = ln.Artifact.from_lazy(suffix=\".zarr\", overwrite_versions=True, key=\"mydata.zarr\")\n            zarr.open(lazy.path, mode=\"w\")[\"test\"] = np.array([\"test\"]) # stream to the path\n            artifact = lazy.save()\n    \"\"\"\n\n    def __init__(self, suffix: str, overwrite_versions: bool, **kwargs):\n        self.kwargs = kwargs\n        self.kwargs[\"overwrite_versions\"] = overwrite_versions\n\n        if (key := kwargs.get(\"key\")) is not None and extract_suffix_from_path(\n            PurePosixPath(key)\n        ) != suffix:\n            raise ValueError(\n                \"The suffix argument and the suffix of key should be the 
same.\"\n            )\n\n        uid, _ = create_uid(n_full_id=20)\n        storage_key = _s().auto_storage_key_from_artifact_uid(\n            uid, suffix, overwrite_versions=overwrite_versions\n        )\n        storepath = setup_settings.storage.root / storage_key\n\n        self._path = storepath\n\n    @property\n    def path(self) -> UPath:\n        return self._path\n\n    def save(self, upload: bool | None = None, **kwargs) -> Artifact:\n        artifact = Artifact(self.path, _is_internal_call=True, **self.kwargs)\n        return artifact.save(upload=upload, **kwargs)\n\n    def __repr__(self) -> str:  # pragma: no cover\n        show_kwargs = {k: v for k, v in self.kwargs.items() if v is not None}\n        return (\n            f\"LazyArtifact object with\\n path: {self.path}\\n arguments: {show_kwargs}\"\n        )\n\n\nT = TypeVar(\"T\", bound=BaseSQLRecord)\n\n\ndef _sqlrecord_or_id(\n    model: type[T],\n    sqlrecord: T | None,\n    sqlrecord_id: int | None,\n    check_type: bool = True,\n) -> T | None:\n    if sqlrecord is not None and sqlrecord_id is not None:\n        raise ValueError(\n            f\"Do not pass both {model.__name__} and its id at the same time.\"\n        )\n\n    if sqlrecord is None and sqlrecord_id is None:\n        return None\n    elif sqlrecord is not None:\n        assert not check_type or isinstance(sqlrecord, model), (\n            f\"Expected {model.__name__}, got {type(sqlrecord).__name__}.\"\n        )\n        return sqlrecord\n    elif sqlrecord_id is not None:\n        return model.objects.get(id=sqlrecord_id)\n\n\nclass Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):\n    \"\"\"Datasets & models stored as files, folders, or arrays.\n\n    Some artifacts are table- or array-like, e.g., when stored as `.parquet`, `.h5ad`, `.zarr`, or `.tiledb`.\n\n    Args:\n        path: `AnyPathStr` A path to a local or remote folder or file from which to create the artifact.\n        key: `str | None = None` A key 
within the storage location, e.g., `\"myfolder/myfile.fcs\"`. Artifacts with the same key form a version family.\n        description: `str | None = None` A description.\n        kind: `Literal[\"dataset\", \"model\"] | str | None = None` Distinguish models from datasets from other files & folders.\n        features: `dict | None = None` External features to annotate via :class:`~lamindb.models.FeatureManager.set_values`.\n        schema: `Schema | None = None` A schema to validate features.\n        revises: `Artifact | None = None` Previous version of the artifact. An alternative to passing `key` when creating a new version.\n        overwrite_versions: `bool | None = None` Whether to overwrite versions. Defaults to `True` for folders and `False` for files.\n        run: `Run | bool | None = None` The run that creates the artifact. If `False`, suppress tracking the run.\n            If `None`, infer the run from the global run context.\n        branch: `Branch | None = None` The branch of the artifact. If `None`, uses the current branch.\n        space: `Space | None = None` The space of the artifact. If `None`, uses the current space.\n        storage: `Storage | None = None` The storage location for the artifact. 
If `None`, uses the default (:attr:`~lamindb.core.Settings.storage`).\n        skip_hash_lookup: `bool = False` Skip the hash lookup so that a new artifact is created even if an artifact with the same hash already exists.\n            Empty files are always treated as if this were `True` because empty content hashes are not used for deduplication.\n\n    Examples:\n\n        Create an artifact **from a local file or folder**::\n\n            artifact = ln.Artifact(\"./my_file.parquet\", key=\"examples/my_file.parquet\").save()\n            artifact = ln.Artifact(\"./my_folder\", key=\"project1/my_folder\").save()\n\n        Calling `.save()` copies or uploads the file to the default storage location of your lamindb instance.\n        If you create an artifact **from a remote file or folder**, lamindb registers the S3 `key` and avoids copying the data::\n\n            artifact = ln.Artifact(\"s3://my_bucket/my_folder/my_file.csv\").save()  # can omit key/description because file is remote\n\n        If you then want to query & access the artifact later on, this is how you do it::\n\n            artifact = ln.Artifact.get(key=\"examples/my_file.parquet\")\n            cached_path = artifact.cache()  # sync to local cache & get local path\n\n        If the storage format supports it, you can load the artifact directly into memory or query it through a streaming interface, e.g., for parquet files::\n\n            df = artifact.load()               # load parquet file as DataFrame\n            pyarrow_dataset = artifact.open()  # open a streaming file-like object\n\n        To bulk-create artifacts for every file in a directory and **group them in a folder**, use :meth:`~lamindb.Artifact.from_dir`::\n\n            artifacts = ln.Artifact.from_dir(\"project_alpha/run_001\").save()  # create one artifact per file in the directory\n            artifacts = ln.Artifact.filter(key__startswith=\"project_alpha/run_001/\")  # query ingested artifacts via the folder prefix\n\n    
    To create a **versioned immutable collection** of artifacts for a data release, use :class:`~lamindb.Collection`::\n\n            collection = ln.Collection(artifacts, key=\"project_alpha/run_001\").save()\n\n        .. dropdown:: Virtual folders (key prefixes) vs. :class:`~lamindb.Collection` objects\n\n            - prefix query on `key`: If a colleague adds a new file to that prefix tomorrow, your `filter(key__startswith=...)` result will change.\n            - collection: A collection object provides a `uid` for every version and its content won't change.\n\n        If you want to **validate & annotate** a dataframe or an array using the feature & label registries,\n        pass `schema` to one of the `.from_dataframe()`, `.from_anndata()`, ... constructors::\n\n            artifact = ln.Artifact.from_dataframe(\n                \"./my_file.parquet\",\n                key=\"my_dataset.parquet\",\n                schema=\"valid_features\"\n            ).save()\n\n        To annotate by **external features**::\n\n            artifact = ln.Artifact(\"./my_file.parquet\", features={\"cell_type_by_model\": \"T cell\"}).save()\n\n        You can make a **new version** of an artifact by passing an existing `key`::\n\n            artifact_v2 = ln.Artifact(\"./my_file.parquet\", key=\"examples/my_file.parquet\").save()\n            artifact_v2.versions.to_dataframe()  # see all versions\n\n        You can write artifacts to **non-default storage locations** by passing the `storage` argument::\n\n            storage_loc = ln.Storage.get(root=\"s3://my_bucket\")  # get storage location, or create via ln.Storage(root=\"s3://my_bucket\").save()\n            ln.Artifact(\"./my_file.parquet\", key=\"examples/my_file.parquet\", storage=storage_loc).save()  # upload to s3://my_bucket\n\n    Notes:\n\n        .. _storage-formats-note:\n\n        .. 
dropdown:: Storage formats & object types\n\n            The `Artifact` registry tracks the storage format via :attr:`suffix` and an abstract object type via :attr:`otype`.\n\n            ================  ======================================  ================  ====================================================================\n            description       :attr:`suffix`                          :attr:`otype`     Python type examples\n            ================  ======================================  ================  ====================================================================\n            table             `.csv`, `.tsv`, `.parquet`, `.ipc`      `\"DataFrame\"`     `pandas.DataFrame`, `polars.DataFrame`, `pyarrow.Table`\n            annotated matrix  `.h5ad`, `.zarr`, `.h5mu`               `\"AnnData\"`       `anndata.AnnData`\n            stacked matrix    `.zarr`                                 `\"MuData\"`        `mudata.MuData`\n                              `.tiledbsoma`                           `\"tiledbsoma\"`    `tiledbsoma.Experiment`\n            spatial data      `.zarr`                                 `\"SpatialData\"`   `spatialdata.SpatialData`\n            generic arrays    `.h5`, `.zarr`, `.tiledb`               ---               `h5py.Dataset`, `zarr.Array`, `tiledb.Array`\n            unstructured      `.fastq`, `.pdf`, `.vcf`, `.html`       ---               ---\n            ================  ======================================  ================  ====================================================================\n\n            You can map storage formats onto **R types**, e.g., an `AnnData` might be accessed via `anndataR`.\n\n            Because `otype` accepts any `str`, you can define custom object types that enable queries & logic\n            that you need, e.g., `\"SingleCellExperiment\"` or `\"MyCustomZarrDataStructure\"`.\n\n            LaminDB makes some default choices (e.g., serialize a `DataFrame` as a 
`.parquet` file).\n\n        .. dropdown:: Will artifacts get duplicated?\n\n            If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact.\n            Exception: empty files are not deduplicated by hash and create a new artifact.\n\n            In concurrent workloads where the same artifact is created repeatedly at the exact same time, `.save()`\n            detects the duplication and will return the existing artifact.\n\n        .. dropdown:: I cannot come up with a good file name, can I avoid mapping artifacts into a hierarchy?\n\n            Sometimes you want to **avoid mapping the artifact into a path hierarchy**. You can do so by omitting the `key` argument and only passing `description`.\n            However, note that a shared `description` does not trigger mapping artifacts into the same version family.\n\n                artifact = ln.Artifact(\"./my_folder\", description=\"My folder\").save()\n                artifact_v2 = ln.Artifact(\"./my_folder\", revises=old_artifact).save()  # need to version based on `revises`, a shared description does not trigger a new version\n\n        .. 
dropdown:: Why does the constructor look the way it looks?\n\n            It's inspired by APIs building on AWS S3.\n\n            Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.\n\n            In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::\n\n                # signature: S3.Bucket.upload_file(filepath, key)\n                import boto3\n                s3 = boto3.resource('s3')\n                bucket = s3.Bucket('mybucket')\n                bucket.upload_file('/tmp/hello.txt', 'hello.txt')\n\n            In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::\n\n                # signature: quilt3.Bucket.put_file(key, filepath)\n                import quilt3\n                bucket = quilt3.Bucket('mybucket')\n                bucket.put_file('hello.txt', '/tmp/hello.txt')\n\n    See Also:\n        :class:`~lamindb.Storage`\n            Storage locations for artifacts.\n        :class:`~lamindb.Collection`\n            Collections of artifacts.\n        :meth:`~lamindb.Artifact.from_dir`\n            Bulk-create artifacts for each file in a directory.\n        :meth:`~lamindb.Artifact.from_dataframe`\n            Create an artifact from a `DataFrame`.\n        :meth:`~lamindb.Artifact.from_anndata`\n            Create an artifact from an `AnnData`.\n        :meth:`~lamindb.Artifact.from_spatialdata`\n            Create an artifact from a `SpatialData`.\n        :meth:`~lamindb.Artifact.from_mudata`\n            Create an artifact from a `MuData`.\n        :meth:`~lamindb.Artifact.from_tiledbsoma`\n            Create an artifact from a `tiledbsoma` store.\n        :meth:`~lamindb.Artifact.from_lazy`\n            Create a lazy artifact for streaming to auto-generated internal paths.\n\n    \"\"\"\n\n    class Meta(SQLRecord.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):\n        abstract = False\n    
    app_label = \"lamindb\"\n        constraints = [\n            # a simple hard unique constraint on `hash` clashes with the fact\n            # that pipelines sometimes aim to ingest the exact same file in different\n            # folders\n            # the conditional composite constraint allows duplicating files in different parts of the\n            # file hierarchy, but errors if the same file is to be registered with the same key\n            # In SQL, NULL values are treated specially in unique constraints.\n            # Multiple NULL values are not considered equal to each other for uniqueness purposes.\n            # For non-NULL keys\n            models.UniqueConstraint(\n                fields=[\"storage\", \"key\", \"hash\"],\n                condition=models.Q(key__isnull=False),\n                name=\"unique_artifact_storage_key_hash_not_null\",\n            ),\n            # For NULL keys (only storage + hash need to be unique)\n            models.UniqueConstraint(\n                fields=[\"storage\", \"hash\"],\n                condition=models.Q(key__isnull=True),\n                name=\"unique_artifact_storage_hash_null_key\",\n            ),\n        ]\n\n    _TRACK_FIELDS = (\"space_id\", \"is_latest\", \"suffix\", \"key\")\n\n    _len_full_uid: int = 20\n    _len_stem_uid: int = 16\n    _name_field: str = \"key\"\n\n    @property\n    def features(self) -> FeatureManager:\n        \"\"\"Feature manager.\n\n        Annotate an artifact with features::\n\n            artifact.features.set_values({\n                \"species\": \"human\",\n                \"scientist\": ['Barbara McClintock', 'Edgar Anderson'],\n                \"temperature\": 27.6,\n                \"experiment\": \"Experiment 1\"\n            })\n\n        Query artifacts by features::\n\n            ln.Artifact.filter(scientist=\"Barbara McClintock\")\n\n        Get all feature annotations as a dictionary::\n\n            d = artifact.features.get_values()\n\n        Get 
a value for a single feature::\n\n            organism = artifact.features[\"species\"]  # returns an Organism object, not \"human\"\n            temperature = artifact.features[\"temperature\"]  # returns a temperature value, a float\n\n        Note that `get_values()` returns identifiers for categorical values (for example, the string\n        \"human\" for an `Organism`), while the `[]` accessor returns the corresponding Python object.\n        See also :meth:`~lamindb.models.FeatureManager.set_values`.\n\n        .. dropdown:: Dataset features vs. external features\n\n            Features may or may not be stored in the dataset, i.e., the artifact content in storage.\n            If you pass a schema to :class:`~lamindb.Artifact.from_dataframe` you validate the columns of the\n            `DataFrame` and annotate with values parsed from these columns.\n            `artifact.features.set_values()`, by contrast, does **not** validate the content of the artifact.\n\n        \"\"\"\n        from ._feature_manager import FeatureManager\n\n        return FeatureManager(self)\n\n    @property\n    def labels(self) -> LabelManager:\n        \"\"\"Label manager.\n\n        A way to access all label annotations of an artifact, irrespective of their type.\n\n        To annotate with labels, use the type-specific accessor,\n        for example::\n\n            experiment = ln.Record(name=\"Experiment 1\").save()\n            artifact.records.add(experiment)\n            project = ln.Project(name=\"Project A\").save()\n            artifact.projects.add(project)\n        \"\"\"\n        from ._label_manager import LabelManager\n\n        return LabelManager(self)\n\n    id: int = models.AutoField(primary_key=True)\n    \"\"\"Internal id, valid only in one DB instance.\"\"\"\n    uid: str = CharField(\n        editable=False, unique=True, db_index=True, max_length=_len_full_uid\n    )\n    \"\"\"A universal random id.\"\"\"\n    # the max length of 1024 equals the max length 
of an S3 key\n    key: str | None = CharField(db_index=True, null=True, max_length=1024)\n    \"\"\"A (virtual) relative file path within the artifact's storage location.\n\n    Setting a `key` is useful to automatically group artifacts into a version family.\n\n    LaminDB defaults to a virtual file path to make renaming of data in object storage easy.\n\n    If you register existing files in a storage location, the `key` equals the\n    actual filepath on the underlying filesystem or object store.\n    \"\"\"\n    _real_key: str | None = CharField(db_index=True, null=True, max_length=1024)\n    \"\"\"An optional real storage key.\"\"\"\n    # db_index on description because sometimes we query for equality in the case of artifacts\n    description: str | None = TextField(null=True, db_index=True)\n    \"\"\"A description.\"\"\"\n    storage: Storage = ForeignKey(\n        Storage, PROTECT, related_name=\"artifacts\", editable=False\n    )\n    \"\"\"Storage location, e.g. an S3 or GCP bucket or a local directory ← :attr:`~lamindb.Storage.artifacts`.\"\"\"\n    suffix: str = CharField(max_length=30, db_index=True, editable=False)\n    # Initially, we thought about having this be nullable to indicate folders\n    # But, for instance, .zarr is stored in a folder that ends with a .zarr suffix\n    \"\"\"The path suffix or an empty string if no suffix exists.\n\n    This is either a file suffix (`\".csv\"`, `\".h5ad\"`, etc.) 
or the empty string \"\".\n    \"\"\"\n    kind: ArtifactKind | str | None = CharField(\n        max_length=20,\n        db_index=True,\n        null=True,\n    )\n    \"\"\":class:`~lamindb.base.types.ArtifactKind` or custom `str` value (default `None`).\"\"\"\n    otype: (\n        Literal[\"DataFrame\", \"AnnData\", \"MuData\", \"SpatialData\", \"tiledbsoma\"]\n        | str\n        | None\n    ) = CharField(max_length=64, db_index=True, null=True, editable=False)\n    \"\"\"The object type represented as a string.\n\n    The field is automatically set when using the `from_dataframe()`, `from_anndata()`, ... constructors.\n    Unstructured artifacts have `otype=None`.\n\n    The field also accepts custom `str` values to allow for building logic around them in third-party packages.\n\n    See section `storage formats & object types <storage-formats-note_>`__ for more background.\n    \"\"\"\n    size: int | None = BigIntegerField(\n        null=True, db_index=True, default=None, editable=False\n    )\n    \"\"\"The size in bytes.\n\n    Examples: 1KB is 1e3 bytes, 1MB is 1e6, 1GB is 1e9, 1TB is 1e12 etc.\n    \"\"\"\n    hash: str | None = CharField(\n        max_length=HASH_LENGTH, db_index=True, null=True, editable=False\n    )\n    \"\"\"The hash or pseudo-hash of the artifact content in storage.\n\n    Useful to ascertain integrity and avoid duplication.\n\n    Different versions of the artifact have different hashes.\n    \"\"\"\n    n_files: int | None = BigIntegerField(\n        null=True, db_index=True, default=None, editable=False\n    )\n    \"\"\"The number of files for folder-like artifacts.\n\n    Is `None` for file-like artifacts.\n\n    Note that some arrays are also stored as folders, e.g., `.zarr` or `.tiledbsoma`.\n    \"\"\"\n    n_observations: int | None = BigIntegerField(\n        null=True, db_index=True, default=None, editable=False\n    )\n    \"\"\"The number of observations in this artifact.\n\n    Typically, this denotes the first 
array dimension.\n    \"\"\"\n    _hash_type: str | None = CharField(\n        max_length=30, db_index=True, null=True, editable=False\n    )\n    \"\"\"Type of hash.\"\"\"\n    run: Run | None = ForeignKey(\n        Run,\n        PROTECT,\n        related_name=\"output_artifacts\",\n        null=True,\n        default=None,\n        editable=False,\n    )\n    \"\"\"The run that created the artifact ← :attr:`~lamindb.Run.output_artifacts`.\"\"\"\n    input_of_runs: RelatedManager[Run] = models.ManyToManyField(\n        Run, related_name=\"input_artifacts\"\n    )\n    \"\"\"The runs that use this artifact as an input ← :attr:`~lamindb.Run.input_artifacts`.\"\"\"\n    recreating_runs: RelatedManager[Run] = models.ManyToManyField(\n        \"Run\",\n        related_name=\"recreated_artifacts\",\n    )\n    \"\"\"The runs that re-created the artifact after its initial creation ← :attr:`~lamindb.Run.recreated_artifacts`.\"\"\"\n    collections: RelatedManager[Collection]\n    \"\"\"The collections that this artifact is part of ← :attr:`~lamindb.Collection.artifacts`.\"\"\"\n    schema: Schema | None = ForeignKey(\n        Schema,\n        PROTECT,\n        null=True,\n        default=None,\n        related_name=\"validated_artifacts\",\n    )\n    \"\"\"The validating schema of this artifact ← :attr:`~lamindb.Schema.validated_artifacts`.\n\n    The validating schema is helpful to query artifacts that were validated by the same schema.\n    \"\"\"\n    schemas: RelatedManager[Schema] = models.ManyToManyField(\n        Schema, related_name=\"artifacts\", through=\"ArtifactSchema\"\n    )\n    \"\"\"The inferred schemas of this artifact ← :attr:`~lamindb.Schema.artifacts`.\n\n    The inferred schemas are helpful to answer the question: \"Which features are present in the artifact?\"\n\n    The validating schema typically allows a range of valid actual dataset schemas.\n    The inferred schemas link the actual schemas of the artifact, and are\n    auto-generated by 
parsing the artifact content during validation.\n    \"\"\"\n    json_values: RelatedManager[JsonValue] = models.ManyToManyField(\n        JsonValue, through=\"ArtifactJsonValue\", related_name=\"artifacts\"\n    )\n    \"\"\"The feature-indexed JSON values annotating this artifact ← :attr:`~lamindb.JsonValue.artifacts`.\"\"\"\n    _key_is_virtual: bool = BooleanField()\n    \"\"\"Indicates whether `key` is virtual or part of an actual file path.\"\"\"\n    # be mindful that below, passing related_name=\"+\" leads to errors\n    _actions: RelatedManager[Artifact] = models.ManyToManyField(\n        \"self\", symmetrical=False, related_name=\"_action_targets\"\n    )\n    \"\"\"The actions to attach for the UI.\"\"\"\n    created_by: User = ForeignKey(\n        \"lamindb.User\",\n        PROTECT,\n        default=current_user_id,\n        related_name=\"created_artifacts\",\n        editable=False,\n    )\n    \"\"\"The creator of this artifact ← :attr:`~lamindb.User.created_artifacts`.\"\"\"\n    _overwrite_versions: bool = BooleanField(default=None)\n    \"\"\"See corresponding property `overwrite_versions`.\"\"\"\n    ulabels: RelatedManager[ULabel]\n    \"\"\"The ulabels annotating this artifact ← :attr:`~lamindb.ULabel.artifacts`.\"\"\"\n    users: RelatedManager[User]\n    \"\"\"The users annotating this artifact ← :attr:`~lamindb.User.artifacts`.\"\"\"\n    projects: RelatedManager[Project]\n    \"\"\"The projects annotating this artifact ← :attr:`~lamindb.Project.artifacts`.\"\"\"\n    references: RelatedManager[Reference]\n    \"\"\"The references annotating this artifact ← :attr:`~lamindb.Reference.artifacts`.\"\"\"\n    records: RelatedManager[Record]\n    \"\"\"The records annotating this artifact ← :attr:`~lamindb.Record.artifacts`.\"\"\"\n    runs: RelatedManager[Run]\n    \"\"\"The runs annotating this artifact ← :attr:`~lamindb.Run.artifacts`.\"\"\"\n    linked_by_runs: RelatedManager[Run]\n    \"\"\"The runs linking this artifact ← 
:attr:`~lamindb.Run.linked_by_artifacts`.\"\"\"\n    artifacts: RelatedManager[Artifact] = models.ManyToManyField(\n        \"Artifact\",\n        through=\"ArtifactArtifact\",\n        symmetrical=False,\n        related_name=\"linked_by_artifacts\",\n    )\n    \"\"\"The annotating artifacts of this artifact ← :attr:`~lamindb.Artifact.linked_by_artifacts`.\"\"\"\n    linked_by_artifacts: RelatedManager[Artifact]\n    \"\"\"The artifacts annotated by this artifact ← :attr:`~lamindb.Artifact.artifacts`.\"\"\"\n    linked_in_records: RelatedManager[Record] = models.ManyToManyField(\n        \"Record\", through=\"RecordArtifact\", related_name=\"linked_artifacts\"\n    )\n    \"\"\"The records linking this artifact as a feature value ← :attr:`~lamindb.Record.linked_artifacts`.\"\"\"\n    ablocks: RelatedManager[ArtifactBlock]\n    \"\"\"Attached blocks ← :attr:`~lamindb.ArtifactBlock.artifact`.\"\"\"\n\n    @overload\n    def __init__(\n        self,\n        path: AnyPathStr,\n        *,\n        key: str | None = None,\n        description: str | None = None,\n        kind: ArtifactKind | str | None = None,\n        features: dict[str, Any] | None = None,\n        schema: Schema | None = None,\n        revises: Artifact | None = None,\n        overwrite_versions: bool | None = None,\n        run: Run | False | None = None,\n        storage: Storage | None = None,\n        branch: Branch | None = None,\n        space: Space | None = None,\n        skip_hash_lookup: bool = False,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(\n        self,\n        *args,\n        **kwargs,\n    ):\n        # check whether we are called with db args\n        if len(args) == len(self._meta.concrete_fields):\n            super().__init__(*args, **kwargs)\n            return None\n        # now proceed with the user-facing constructor\n        if len(args) > 1:\n            raise ValueError(\"Only one non-keyword arg 
allowed: path\")\n\n        if \"data\" in kwargs:\n            warnings.warn(\n                \"`data` argument was renamed to `path` and will be removed in a future release.\",\n                DeprecationWarning,\n                stacklevel=2,\n            )\n            path = kwargs.pop(\"data\")\n        else:\n            path = kwargs.pop(\"path\") if len(args) == 0 else args[0]\n\n        kind: str = kwargs.pop(\"kind\", None)\n        key: str | None = kwargs.pop(\"key\", None)\n        using_key = kwargs.pop(\"using_key\", None)\n        description: str | None = kwargs.pop(\"description\", None)\n        revises: Artifact | None = kwargs.pop(\"revises\", None)\n        if revises is not None:\n            if not isinstance(revises, Artifact):\n                raise TypeError(\"`revises` has to be of type `Artifact`\")\n            if description is None:\n                description = revises.description\n        overwrite_versions: bool | None = kwargs.pop(\"overwrite_versions\", None)\n        version_tag: str | None = kwargs.pop(\"version_tag\", kwargs.pop(\"version\", None))\n        features: dict[str, Any] | None = kwargs.pop(\"features\", None)\n        skip_hash_lookup: bool = kwargs.pop(\"skip_hash_lookup\", False)\n        to_disk_kwargs: dict[str, Any] | None = kwargs.pop(\"to_disk_kwargs\", None)\n        format = kwargs.pop(\"format\", None)\n        _key_is_virtual = kwargs.pop(\"_key_is_virtual\", None)\n        _is_internal_call = kwargs.pop(\"_is_internal_call\", False)\n        skip_check_exists = kwargs.pop(\"skip_check_exists\", False)\n\n        if key is not None and _s().AUTO_KEY_PREFIX in key:\n            raise ValueError(\n                f\"Do not pass key that contains a managed storage path in `{_s().AUTO_KEY_PREFIX}`\"\n            )\n        # below is for internal calls that require defining the storage location\n        # ahead of constructing the Artifact\n        if isinstance(path, (str, Path, UPath)) and 
_s().AUTO_KEY_PREFIX in str(path):\n            if _is_internal_call:\n                if _key_is_virtual is False:\n                    raise ValueError(\n                        \"Do not pass _key_is_virtual=False with _is_internal_call=True.\"\n                    )\n                is_automanaged_path = True\n                user_provided_key = key\n                key = None\n            else:\n                raise ValueError(\n                    f\"Do not pass path inside the `{_s().AUTO_KEY_PREFIX}` directory.\"\n                )\n        else:\n            is_automanaged_path = False\n\n        # validate external features if passed with a schema\n        schema: Schema | None = _sqlrecord_or_id(\n            Schema, kwargs.pop(\"schema\", None), kwargs.pop(\"schema_id\", None)\n        )\n        if features is not None:\n            self._external_features = features\n            if schema is not None:\n                from lamindb.curators.core import ExperimentalDictCurator\n\n                validation_schema = schema\n                ExperimentalDictCurator(features, validation_schema).validate()\n        # check_type is False because run can be False also, see get_run\n        run: Run | None | bool = _sqlrecord_or_id(\n            Run, kwargs.pop(\"run\", None), kwargs.pop(\"run_id\", None), check_type=False\n        )\n        branch: Branch | None = _sqlrecord_or_id(\n            Branch, kwargs.pop(\"branch\", None), kwargs.pop(\"branch_id\", None)\n        )\n        space: Space | None = _sqlrecord_or_id(\n            Space, kwargs.pop(\"space\", None), kwargs.pop(\"space_id\", None)\n        )\n        storage: Storage | None = _sqlrecord_or_id(\n            Storage, kwargs.pop(\"storage\", None), kwargs.pop(\"storage_id\", None)\n        )\n        storage_was_passed = False\n        if storage is not None:\n            storage_was_passed = True\n        elif (\n            setup_settings.instance.keep_artifacts_local\n            and 
setup_settings.instance._local_storage is not None\n        ):\n            storage = setup_settings.instance.local_storage.record\n        else:\n            storage = setup_settings.instance.storage.record\n        if space is None:\n            from lamindb import context as run_context\n\n            if run_context.space is not None:\n                space = run_context.space\n            elif setup_settings.space is not None:\n                space = setup_settings.space\n        # space - storage consistency is also checked in .save() when the space is changed\n        if space is not None and space.id != storage.space_id:\n            if storage_was_passed:\n                logger.warning(\n                    \"storage argument ignored as storage information from space takes precedence\"\n                )\n            storage_locs_for_space = Storage.filter(\n                space=space, instance_uid=setup_settings.instance.uid\n            ).order_by(\"id\")\n            n_storage_locs_for_space = storage_locs_for_space.count()\n            if n_storage_locs_for_space == 0:\n                raise NoStorageLocationForSpace(\n                    \"No storage location found for space.\\n\"\n                    \"Either create one via ln.Storage(root='create-s3', space=space).save()\\n\"\n                    \"Or start managing access to an existing storage location via the space: storage_loc.space = space; storage.save()\"\n                )\n            else:\n                storage = storage_locs_for_space.first()\n                if n_storage_locs_for_space > 1:\n                    other_storage_locs = \",\".join(\n                        f\"{s.root}\" for s in storage_locs_for_space[1:]\n                    )\n                    logger.warning(\n                        f\"more than one storage location is managed by this instance for space {space},\\n\"\n                        f\"choosing root={storage.root}\\n\"\n                    )\n              
      logger.important_hint(\n                        f\"to choose one of the other storage locations ({other_storage_locs}), pass `storage` to the Artifact constructor\"\n                    )\n        otype = kwargs.pop(\"otype\") if \"otype\" in kwargs else None\n        if isinstance(path, str) and path.startswith(\"s3:///\"):\n            # issue in Groovy / nf-lamin producing malformed S3 paths\n            # https://laminlabs.slack.com/archives/C08J590666Q/p1751315027830849?thread_ts=1751039961.479259&cid=C08J590666Q\n            path = path.replace(\"s3:///\", \"s3://\")\n        otype = check_otype_artifact(\n            data=path, otype=otype, cloud_warning=not _is_internal_call\n        )\n        if \"type\" in kwargs:\n            logger.warning(\"`type` will be removed soon, please use `kind`\")\n            kind = kwargs.pop(\"type\")\n        if not len(kwargs) == 0:\n            valid_keywords = \", \".join([val[0] for val in _get_record_kwargs(Artifact)])\n            raise FieldValidationError(\n                f\"Only {valid_keywords} can be passed, you passed: {kwargs}\"\n            )\n        if revises is not None and key is not None and revises.key != key:\n            logger.warning(f\"renaming artifact from '{revises.key}' to {key}\")\n\n        provisional_uid, revises = create_uid(revises=revises, version_tag=version_tag)\n        run = get_run(run)\n        kwargs_or_artifact, privates = get_artifact_kwargs_from_data(\n            data=path,\n            key=key,\n            run=run,\n            format=format,\n            provisional_uid=provisional_uid,\n            version_tag=version_tag,\n            storage=storage,\n            using_key=using_key,\n            skip_check_exists=skip_check_exists,\n            overwrite_versions=overwrite_versions,\n            skip_hash_lookup=skip_hash_lookup,\n            to_disk_kwargs=to_disk_kwargs,\n            key_is_virtual=_key_is_virtual,\n        )\n\n        def 
set_private_attributes():\n            if path is not None and \"local_filepath\" in privates:\n                self._local_filepath = privates[\"local_filepath\"]\n                self._cloud_filepath = privates[\"cloud_filepath\"]\n                self._memory_rep = privates[\"memory_rep\"]\n                self._to_store = not privates[\"check_path_in_storage\"]\n\n                if (\n                    self._to_store\n                    and not privates[\"is_artifact_storage_managed_by_current_instance\"]\n                ):\n                    raise ValueError(\n                        \"Cannot create an artifact in a storage location that is not managed by the current instance.\"\n                    )\n\n        # an object with the same hash already exists\n        if isinstance(kwargs_or_artifact, Artifact):\n            from .sqlrecord import init_self_from_db, update_attributes\n\n            init_self_from_db(self, kwargs_or_artifact)\n            # update key from inferred value\n            key = privates.pop(\"key\")\n            # adding \"key\" here is dangerous because key might be auto-populated\n            attr_to_update = {\"description\": description}\n            if schema is not None:\n                attr_to_update[\"schema\"] = schema\n            if kwargs_or_artifact._key_is_virtual and kwargs_or_artifact.key is None:\n                attr_to_update[\"key\"] = key\n            elif self.key != key and key is not None:\n                if not self.path.exists():\n                    logger.warning(f\"updating previous key {self.key} to new key {key}\")\n                    self.key = key\n                    # Keep tracked state aligned with this internal dedup-time key\n                    # normalization so save() doesn't treat it as a user key edit.\n                    self._original_values[\"key\"] = key\n                    assert self.path.exists(), (  # noqa: S101\n                        f\"The underlying file for artifact 
{self} does not exist anymore, clean up the artifact record.\"\n                    )  # noqa: S101\n                else:\n                    logger.warning(\n                        f\"key {self.key} on existing artifact differs from passed key {key}, keeping original key; update manually if needed or pass skip_hash_lookup if you want to duplicate the artifact\"\n                    )\n            update_attributes(self, attr_to_update)\n            # an existing artifact might have an incomplete upload and hence we should\n            # re-populate _local_filepath because this is what triggers the upload\n            set_private_attributes()\n            populate_subsequent_run(self, run)\n            return None\n        else:\n            kwargs = kwargs_or_artifact\n            kwargs[\"schema\"] = schema\n\n        if revises is None:\n            revises = kwargs_or_artifact.pop(\"revises\")\n\n        set_private_attributes()\n\n        if is_automanaged_path and _is_internal_call:\n            kwargs[\"_key_is_virtual\"] = True\n            assert _s().AUTO_KEY_PREFIX in kwargs[\"key\"]  # noqa: S101\n            uid = (\n                kwargs[\"key\"]\n                .replace(_s().AUTO_KEY_PREFIX, \"\")\n                .replace(kwargs[\"suffix\"], \"\")\n            )\n            kwargs[\"key\"] = user_provided_key\n            if revises is not None:\n                assert uid.startswith(revises.stem_uid)  # noqa: S101\n            if len(uid) == 16:\n                if revises is None:\n                    uid += \"0000\"\n                else:\n                    uid, revises = create_uid(revises=revises, version_tag=version_tag)\n            kwargs[\"uid\"] = uid\n\n        # only set key now so that we don't perform a look-up on it in case revises is passed\n        if revises is not None and revises.key is not None and kwargs[\"key\"] is None:\n            kwargs[\"key\"] = revises.key\n\n        kwargs[\"kind\"] = kind\n        
kwargs[\"version_tag\"] = version_tag\n        kwargs[\"description\"] = description\n        kwargs[\"branch\"] = branch\n        kwargs[\"space\"] = space\n        kwargs[\"otype\"] = otype\n        kwargs[\"revises\"] = revises\n        # this check needs to come down here because key might be populated from an\n        # existing file path during get_artifact_kwargs_from_data()\n        if (\n            kwargs[\"key\"] is None\n            and kwargs[\"description\"] is None\n            and kwargs[\"run\"] is None\n        ):\n            raise ValueError(\"Pass one of key, run or description as a parameter\")\n\n        super().__init__(**kwargs)\n\n    @property\n    def transform(self) -> Transform | None:\n        \"\"\"Transform whose run created the artifact.\"\"\"\n        return self.run.transform if self.run is not None else None\n\n    @property\n    def overwrite_versions(self) -> bool:\n        \"\"\"Indicates whether to keep or overwrite versions.\n\n        It defaults to `False` for file-like artifacts and to `True` for folder-like artifacts.\n\n        Note that this requires significant storage space for large folders with\n        many duplicated files. 
Currently, `lamindb` does *not* de-duplicate files across\n        versions as in git, but keeps all files for all versions of the folder in storage.\n        \"\"\"\n        return self._overwrite_versions\n\n    @property\n    def _storage_ongoing(self) -> bool:\n        \"\"\"Whether the artifact is still in the process of being saved to storage (uploaded for cloud storage).\n\n        - `True`: write started but not completed\n        - `False`: storage completed or not yet started\n\n        In the JSON `_aux` field, `True` is represented as `{\"so\": 1}` and `False` as\n        an absent `\"so\"` key.\n        \"\"\"\n        if self._aux is None:\n            return False\n        if self._aux.get(\"so\") == 1:\n            return True\n        else:\n            return False\n\n    @_storage_ongoing.setter\n    def _storage_ongoing(self, value: bool | None) -> None:\n        if value is None or value is False:\n            if self._aux is not None and \"so\" in self._aux:\n                del self._aux[\"so\"]\n                if not self._aux:\n                    self._aux = None\n        else:\n            if self._aux is None:\n                self._aux = {}\n            assert value is True\n            self._aux[\"so\"] = 1\n\n    @property\n    @deprecated(\"schemas\")\n    def feature_sets(self):\n        return self.schemas\n\n    @property\n    def path(self) -> UPath:\n        \"\"\"Path.\n\n        Example::\n\n            import lamindb as ln\n\n            # File in cloud storage, here AWS S3:\n            artifact = ln.Artifact(\"s3://my-bucket/my-file.csv\").save()\n            artifact.path\n            #> S3QueryPath('s3://my-bucket/my-file.csv')\n\n            # File in local storage:\n            ln.Artifact(\"./myfile.csv\", key=\"myfile.csv\").save()\n            artifact.path\n            #> PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/myfile.csv')\n        \"\"\"\n        filepath, _ = _s().filepath_from_artifact(self, 
using_key=settings._using_key)\n        return filepath\n\n    @property\n    def _cache_path(self) -> UPath:\n        filepath, cache_key = _s().filepath_cache_key_from_artifact(\n            self, using_key=settings._using_key\n        )\n        if isinstance(filepath, LocalPathClasses):\n            return filepath\n        return setup_settings.paths.cloud_to_local_no_update(\n            filepath, cache_key=cache_key\n        )\n\n    @strict_classmethod\n    def get(\n        cls,\n        idlike: int | str | None = None,\n        *,\n        key: str | None = None,\n        path: AnyPathStr | None = None,\n        is_run_input: bool | Run = False,\n        **expressions,\n    ) -> Artifact:\n        \"\"\"Get a single artifact.\n\n        Args:\n            idlike: Either a uid stub, uid or an integer id.\n            key: An optional key to query for.\n            path: An optional full path to query for, including the storage root.\n            is_run_input: Whether to track this artifact as run input.\n            expressions: Other fields and values passed as Django query expressions.\n\n        Raises:\n            :exc:`lamindb.errors.DoesNotExist`: In case no matching record is found.\n\n        See Also:\n            - Guide: :doc:`registries`\n            - Method in `SQLRecord` base class: :meth:`~lamindb.models.SQLRecord.get`\n\n        Examples:\n\n            ::\n\n                artifact = ln.Artifact.get(\"tCUkRcaEjTjhtozp\")       # gets latest version for family tCUkRcaEjTjhtozp\n                artifact = ln.Artifact.get(\"tCUkRcaEjTjhtozp0005\")   # gets version 0005 for family tCUkRcaEjTjhtozp\n                artifact = ln.Artifact.get(key=\"examples/my_file.parquet\")               # gets latest version for a key\n                artifact = ln.Artifact.get(key=\"examples/my_file.parquet\", version=\"2\")  # pass a version tag\n                artifact = ln.Artifact.get(path=\"s3://bucket/folder/adata.h5ad\")\n        \"\"\"\n        
if key is not None:\n            expressions[\"key\"] = key\n        if path is not None:\n            expressions[\"path\"] = path\n        return QuerySet(model=cls).get(idlike, is_run_input=is_run_input, **expressions)\n\n    @strict_classmethod\n    def filter(\n        cls,\n        *queries,\n        **expressions,\n    ) -> QuerySet:\n        \"\"\"Query a set of artifacts.\n\n        Args:\n            *queries: `Q` expressions.\n            **expressions: Features & fields via the Django query syntax.\n\n        See Also:\n            - Guide: :doc:`docs:registries`\n\n        Examples:\n\n            Query by fields::\n\n                ln.Artifact.filter(key=\"examples/my_file.parquet\")\n\n            Query by features::\n\n                ln.Artifact.filter(cell_type_by_model__name=\"T cell\")\n\n        \"\"\"\n        # from Registry metaclass\n        return type(cls).filter(cls, *queries, **expressions)\n\n    @classmethod\n    def from_lazy(\n        cls,\n        suffix: str,\n        overwrite_versions: bool,\n        key: str | None = None,\n        description: str | None = None,\n        run: Run | None = None,\n        **kwargs,\n    ) -> LazyArtifact:\n        \"\"\"Create a lazy artifact for streaming to auto-generated internal paths.\n\n        This is needed when it is desirable to stream to a `lamindb` auto-generated internal path\n        and register the path as an artifact. 
It allows writing directly into the default cloud\n        (or local) storage of the current instance and then saving as an :class:`~lamindb.Artifact`.\n\n        The lazy artifact object (see :class:`~lamindb.models.LazyArtifact`) creates a real artifact\n        on `.save()` with the provided arguments.\n\n        Args:\n            suffix: The suffix for the auto-generated internal path\n            overwrite_versions: Whether to overwrite versions.\n            key: An optional key to reference the artifact.\n            description: A description.\n            run: The run that creates the artifact.\n            **kwargs: Other keyword arguments for the artifact to be created.\n\n        Examples:\n\n            Local storage: create a lazy artifact, stream to the path, then save::\n\n                lazy = ln.Artifact.from_lazy(suffix=\".zarr\", overwrite_versions=True, key=\"mydata.zarr\")\n                zarr.open(lazy.path, mode=\"w\")[\"test\"] = np.array([\"test\"])\n                artifact = lazy.save()\n\n            Cloud storage (e.g. 
S3): use `zarr.storage.FsspecStore` to stream arrays::\n\n                lazy = ln.Artifact.from_lazy(suffix=\".zarr\", overwrite_versions=True, key=\"mydata.zarr\")\n                store = zarr.storage.FsspecStore.from_url(lazy.path.as_posix())\n                group = zarr.open(store, mode=\"w\")\n                group[\"ones\"] = np.ones(3)\n                artifact = lazy.save()\n        \"\"\"\n        args = {\"key\": key, \"description\": description, \"run\": run, **kwargs}\n        return LazyArtifact(suffix, overwrite_versions, **args)\n\n    @classmethod\n    def from_dataframe(\n        cls,\n        df: pd.DataFrame | AnyPathStr,\n        *,\n        key: str | None = None,\n        description: str | None = None,\n        run: Run | None = None,\n        revises: Artifact | None = None,\n        schema: Schema | Literal[\"valid_features\"] | None = None,\n        features: dict[str, Any] | None = None,\n        parquet_kwargs: dict[str, Any] | None = None,\n        csv_kwargs: dict[str, Any] | None = None,\n        **kwargs,\n    ) -> Artifact:\n        \"\"\"Create from `DataFrame`, optionally validate & annotate.\n\n        Sets `.otype` to `\"DataFrame\"` and populates `.n_observations`.\n\n        Args:\n            df: A `DataFrame` object or an `AnyPathStr` pointing to a `DataFrame` in storage, e.g. 
a `.parquet` or `.csv` file.\n            key: A relative path within default storage, e.g., `\"myfolder/myfile.parquet\"`.\n            description: A description.\n            revises: An old version of the artifact.\n            run: The run that creates the artifact.\n            schema: A schema that defines how to validate & annotate.\n            features: Additional external features to annotate the artifact via :class:`~lamindb.models.FeatureManager.set_values` (keys can be feature names or `Feature` objects).\n            parquet_kwargs: Additional keyword arguments passed to the\n                `pandas.DataFrame.to_parquet` method, which are passed\n                on to `pyarrow.parquet.ParquetWriter`.\n            csv_kwargs: Additional keyword arguments passed to the `pandas.DataFrame.to_csv` method.\n\n        Examples:\n\n            No validation and annotation::\n\n                ln.Artifact.from_dataframe(df, key=\"examples/dataset1.parquet\").save()\n\n            With validation and annotation::\n\n                ln.Artifact.from_dataframe(df, key=\"examples/dataset1.parquet\", schema=\"valid_features\").save()\n\n            Under-the-hood, this uses the following built-in schema (:func:`~lamindb.examples.schemas.valid_features`)::\n\n                schema = ln.Schema(name=\"valid_features\", itype=\"Feature\").save()\n\n            External features:\n\n            .. literalinclude:: scripts/curate_dataframe_external_features.py\n               :language: python\n\n            Parquet kwargs:\n\n            .. 
literalinclude:: scripts/test_artifact_parquet.py\n               :language: python\n        \"\"\"\n        if \"format\" not in kwargs and key is not None and key.endswith(\".csv\"):\n            kwargs[\"format\"] = \".csv\"\n        if schema == \"valid_features\":\n            from lamindb import examples\n\n            schema = examples.schemas.valid_features()\n\n        to_disk_kwargs: dict[str, Any] = parquet_kwargs or csv_kwargs\n        artifact = Artifact(  # type: ignore\n            path=df,\n            key=key,\n            run=run,\n            description=description,\n            revises=revises,\n            otype=\"DataFrame\",\n            kind=\"dataset\",\n            to_disk_kwargs=to_disk_kwargs,\n            **kwargs,\n        )\n        if data_is_dataframe(df):\n            artifact.n_observations = len(df)\n        else:\n            # must be a str or path\n            path = create_path(df)\n            if path.suffix == \".parquet\":\n                import pyarrow.parquet as pq\n\n                with path.open(\"rb\") as f:\n                    artifact.n_observations = pq.read_metadata(f).num_rows\n            else:\n                # csv/tsv/others have no metadata and would require a full expensive read\n                artifact.n_observations = None\n        if features is not None:\n            artifact._external_features = features\n        if schema is not None:\n            from lamindb.curators.core import DataFrameCurator\n\n            if not artifact._state.adding and artifact.suffix != \".parquet\":\n                logger.warning(\n                    f\"not re-validating existing artifact as it was stored as {artifact.suffix}, \"\n                    \"which does not maintain categorical dtype information\"\n                )\n                return artifact\n\n            curator = DataFrameCurator(artifact, schema, features=features)\n            curator.validate()\n            artifact.schema = schema\n           
 artifact._curator = curator\n        return artifact\n\n    @classmethod\n    @deprecated(\"from_dataframe\")\n    def from_df(\n        cls,\n        df: pd.DataFrame,\n        *,\n        key: str | None = None,\n        description: str | None = None,\n        run: Run | None = None,\n        revises: Artifact | None = None,\n        schema: Schema | None = None,\n        **kwargs,\n    ) -> Artifact:\n        return cls.from_dataframe(\n            df,\n            key=key,\n            description=description,\n            run=run,\n            revises=revises,\n            schema=schema,\n            **kwargs,\n        )\n\n    @classmethod\n    def from_anndata(\n        cls,\n        adata: Union[AnnData, AnyPathStr],\n        *,\n        key: str | None = None,\n        description: str | None = None,\n        run: Run | None = None,\n        revises: Artifact | None = None,\n        schema: Schema\n        | Literal[\"ensembl_gene_ids_and_valid_features_in_obs\"]\n        | None = None,\n        format: Literal[\"h5ad\", \"zarr\", \"anndata.zarr\"] | None = None,\n        h5ad_kwargs: dict[str, Any] | None = None,\n        zarr_kwargs: dict[str, Any] | None = None,\n        **kwargs,\n    ) -> Artifact:\n        \"\"\"Create from `AnnData`, optionally validate & annotate.\n\n        Sets `.otype` to `\"AnnData\"` and populates `.n_observations`.\n\n        Args:\n            adata: An `AnnData` object or a path of AnnData-like.\n            key: A relative path within default storage, e.g., `\"myfolder/myfile.h5ad\"`.\n            description: A description.\n            revises: An old version of the artifact.\n            run: The run that creates the artifact.\n            schema: A schema that defines how to validate & annotate.\n            format: Storage format used when writing in-memory `AnnData`.\n                In-memory `AnnData` is first written to cache in this format, then saved to instance storage when calling `.save()`.\n                
If `None`, infer from `key` suffix when available, otherwise default to `\"h5ad\"`.\n                If provided, suffix is formed as `\".\" + format` (e.g., `\"zarr\"` -> `\".zarr\"`).\n            h5ad_kwargs: Additional keyword arguments passed to the `anndata.AnnData.write_h5ad` method\n                when writing in-memory `AnnData` to cache.\n            zarr_kwargs: Additional keyword arguments passed to the `anndata.AnnData.write_zarr` method\n                when writing in-memory `AnnData` to cache. Use `key` with suffix `.zarr` or pass `format=\"zarr\"` for this to work.\n\n        See Also:\n            :meth:`~lamindb.Collection`\n                Track collections.\n            :class:`~lamindb.Feature`\n                Track features.\n\n        Example:\n\n            Write H5AD with custom serialization settings::\n\n                ln.Artifact.from_anndata(\n                    adata,\n                    key=\"examples/dataset1.h5ad\",\n                    h5ad_kwargs={\"compression\": \"gzip\"},\n                ).save()\n\n            Write Zarr with custom chunking settings::\n\n                ln.Artifact.from_anndata(\n                    adata,\n                    key=\"examples/dataset1.zarr\",\n                    format=\"zarr\",\n                    zarr_kwargs={\"chunks\": [1024, 1024]},\n                ).save()\n\n            No validation and annotation::\n\n                ln.Artifact.from_anndata(adata, key=\"examples/dataset1.h5ad\").save()\n\n            With validation and annotation::\n\n                ln.Artifact.from_anndata(adata, key=\"examples/dataset1.h5ad\", schema=\"ensembl_gene_ids_and_valid_features_in_obs\").save()\n\n            Under-the-hood, this uses the following built-in schema (:func:`~lamindb.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs`):\n\n            .. 
literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py\n               :language: python\n\n            This schema transposes the `var` DataFrame during curation, so that one validates and annotates the columns of `var.T`, i.e., `[ENSG00000153563, ENSG00000010610, ENSG00000170458]`.\n            If one doesn't transpose, one would annotate the columns of `var`, i.e., `[gene_symbol, gene_type]`.\n\n            .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/gLyfToATM7WUzkWW0001.png\n               :width: 800px\n        \"\"\"\n        if not data_is_scversedatastructure(adata, \"AnnData\"):\n            raise ValueError(\n                \"data has to be an AnnData object or a path to AnnData-like\"\n            )\n\n        if schema == \"ensembl_gene_ids_and_valid_features_in_obs\":\n            from lamindb import examples\n\n            schema = (\n                examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs()\n            )\n\n        to_disk_kwargs: dict[str, Any] = h5ad_kwargs or zarr_kwargs\n        artifact = Artifact(  # type: ignore\n            path=adata,\n            key=key,\n            run=run,\n            description=description,\n            revises=revises,\n            otype=\"AnnData\",\n            kind=\"dataset\",\n            format=format,\n            to_disk_kwargs=to_disk_kwargs,\n            **kwargs,\n        )\n        # this is done instead of _anndata_n_observations(adata)\n        # because we need a proper path through create_path for cloud paths\n        # for additional upath options etc that create_path adds\n        obj_for_obs: AnnData | UPath\n        if hasattr(artifact, \"_memory_rep\") and artifact._memory_rep is not None:\n            obj_for_obs = artifact._memory_rep\n        else:\n            # returns ._local_filepath for local files\n            # and the proper path through create_path for cloud paths\n            obj_for_obs = 
artifact.path\n        from ..core.storage._anndata_accessor import _anndata_n_observations\n\n        artifact.n_observations = _anndata_n_observations(obj_for_obs)\n        if schema is not None:\n            from ..curators import AnnDataCurator\n\n            curator = AnnDataCurator(artifact, schema)\n            curator.validate()\n            artifact.schema = schema\n            artifact._curator = curator\n        return artifact\n\n    @classmethod\n    def from_mudata(\n        cls,\n        mdata: Union[MuData, AnyPathStr],\n        *,\n        key: str | None = None,\n        description: str | None = None,\n        run: Run | None = None,\n        revises: Artifact | None = None,\n        schema: Schema | None = None,\n        **kwargs,\n    ) -> Artifact:\n        \"\"\"Create from `MuData`, optionally validate & annotate.\n\n        Sets `.otype` to `\"MuData\"`.\n\n        Args:\n            mdata: A `MuData` object.\n            key: A relative path within default storage, e.g., `\"myfolder/myfile.h5mu\"`.\n            description: A description.\n            revises: An old version of the artifact.\n            run: The run that creates the artifact.\n            schema: A schema that defines how to validate & annotate.\n\n        See Also:\n            :meth:`~lamindb.Collection`\n                Track collections.\n            :class:`~lamindb.Feature`\n                Track features.\n\n        Example::\n\n            import lamindb as ln\n\n            mdata = ln.examples.datasets.mudata_papalexi21_subset()\n            artifact = ln.Artifact.from_mudata(mdata, key=\"mudata_papalexi21_subset.h5mu\").save()\n        \"\"\"\n        if not data_is_scversedatastructure(mdata, \"MuData\"):\n            raise ValueError(\"data has to be a MuData object or a path to MuData-like\")\n        artifact = Artifact(  # type: ignore\n            path=mdata,\n            key=key,\n            run=run,\n            description=description,\n            
revises=revises,\n            otype=\"MuData\",\n            kind=\"dataset\",\n            **kwargs,\n        )\n        if not isinstance(mdata, (str, Path, UPath)):\n            artifact.n_observations = mdata.n_obs\n        if schema is not None:\n            from ..curators import MuDataCurator\n\n            curator = MuDataCurator(artifact, schema)\n            curator.validate()\n            artifact.schema = schema\n            artifact._curator = curator\n        return artifact\n\n    @classmethod\n    def from_spatialdata(\n        cls,\n        sdata: SpatialData | AnyPathStr,\n        *,\n        key: str | None = None,\n        description: str | None = None,\n        run: Run | None = None,\n        revises: Artifact | None = None,\n        schema: Schema | None = None,\n        **kwargs,\n    ) -> Artifact:\n        \"\"\"Create from `SpatialData`, optionally validate & annotate.\n\n        Sets `.otype` to `\"SpatialData\"`.\n\n        Args:\n            sdata: A `SpatialData` object.\n            key: A relative path within default storage, e.g., `\"myfolder/myfile.zarr\"`.\n            description: A description.\n            revises: An old version of the artifact.\n            run: The run that creates the artifact.\n            schema: A schema that defines how to validate & annotate.\n\n        See Also:\n            :meth:`~lamindb.Collection`\n                Track collections.\n            :class:`~lamindb.Feature`\n                Track features.\n\n        Example:\n\n            No validation and annotation::\n\n                import lamindb as ln\n\n                artifact = ln.Artifact.from_spatialdata(sdata, key=\"my_dataset.zarr\").save()\n\n            With validation and annotation. 
First, find a `SpatialData` schema, e.g.::\n\n                ln.Schema.filter(otype=\"SpatialData\").to_dataframe()\n                schema = ln.Schema.get(name=\"spatialdata_blobs_schema\")\n\n            Then, pass the schema to the `from_spatialdata` method::\n\n                artifact = ln.Artifact.from_spatialdata(sdata, key=\"my_dataset.zarr\", schema=schema).save()\n\n            You can also define a schema from scratch:\n\n            .. literalinclude:: scripts/define_schema_spatialdata.py\n                :language: python\n\n        \"\"\"\n        if not data_is_scversedatastructure(sdata, \"SpatialData\"):\n            raise ValueError(\n                \"data has to be a SpatialData object or a path to SpatialData-like\"\n            )\n        artifact = Artifact(  # type: ignore\n            path=sdata,\n            key=key,\n            run=run,\n            description=description,\n            revises=revises,\n            otype=\"SpatialData\",\n            kind=\"dataset\",\n            **kwargs,\n        )\n        # ill-defined https://scverse.zulipchat.com/#narrow/channel/315824-spatial/topic/How.20to.20calculate.20the.20number.20of.20observations.3F\n        # artifact.n_observations = ...\n        if schema is not None:\n            from ..curators import SpatialDataCurator\n\n            curator = SpatialDataCurator(artifact, schema)\n            curator.validate()\n            artifact.schema = schema\n            artifact._curator = curator\n        return artifact\n\n    @classmethod\n    def from_tiledbsoma(\n        cls,\n        exp: SOMAExperiment | AnyPathStr,\n        *,\n        key: str | None = None,\n        description: str | None = None,\n        run: Run | None = None,\n        revises: Artifact | None = None,\n        **kwargs,\n    ) -> Artifact:\n        \"\"\"Create from a `tiledbsoma.Experiment` store.\n\n        Sets `.otype` to `\"tiledbsoma\"` and populates `.n_observations`.\n\n        Args:\n            exp: 
TileDB-SOMA Experiment object or path to Experiment store.\n            key: A relative path within default storage, e.g., `\"myfolder/mystore.tiledbsoma\"`.\n            description: A description.\n            revises: An old version of the artifact.\n            run: The run that creates the artifact.\n\n        Example::\n\n            import lamindb as ln\n\n            artifact = ln.Artifact.from_tiledbsoma(\"s3://mybucket/store.tiledbsoma\", description=\"a tiledbsoma store\").save()\n        \"\"\"\n        if not data_is_soma_experiment(exp):\n            raise ValueError(\n                \"data has to be a SOMA Experiment object or a path to SOMA Experiment store.\"\n            )\n\n        # SOMAExperiment.uri may have file:// prefix for local paths which needs stripping for filesystem access.\n        # Other URI schemes (s3://, etc.) are preserved and supported.\n        exp = (\n            exp.uri.removeprefix(\"file://\")\n            if not isinstance(exp, (str, Path, UPath))\n            else exp\n        )\n\n        artifact = Artifact(  # type: ignore\n            path=exp,\n            key=key,\n            run=run,\n            description=description,\n            revises=revises,\n            otype=\"tiledbsoma\",\n            kind=\"dataset\",\n            **kwargs,\n        )\n        from ..core.storage._tiledbsoma import _soma_n_observations\n\n        artifact.n_observations = _soma_n_observations(artifact.path)\n        return artifact\n\n    @classmethod\n    def from_dir(\n        cls,\n        path: AnyPathStr,\n        *,\n        key: str | None = None,\n        run: Run | None = None,\n    ) -> SQLRecordList:\n        \"\"\"Create a list of :class:`~lamindb.Artifact` objects from a directory.\n\n        Hint:\n            If you have a high number of files (several 100k) and don't want to\n            track them individually, create a single :class:`~lamindb.Artifact` via\n            ``Artifact(path)`` for them. 
See, e.g., :doc:`docs:rxrx`.\n\n        Args:\n            path: Source path of folder.\n            key: Key for storage destination.\n                If `None` and directory is in a registered location, the inferred `key` will reflect the relative position.\n                If `None` and directory is outside of a registered storage location, the inferred key defaults to `path.name`.\n            run: A `Run` object.\n\n        Example::\n\n            import lamindb as ln\n\n            dir_path = ln.examples.datasets.dir_scrnaseq_cellranger(\"sample_001\", ln.settings.storage)\n            ln.Artifact.from_dir(dir_path).save()  # creates one artifact per file in dir_path\n        \"\"\"\n        folderpath: UPath = create_path(path)  # returns Path for local\n        storage = settings.storage.record\n        using_key = settings._using_key\n        storage, use_existing_storage = process_pathlike(folderpath, storage, using_key)\n        folder_key_path: PurePath | Path\n        if key is None:\n            if not use_existing_storage:\n                logger.warning(\n                    \"folder is outside existing storage location, will copy files from\"\n                    f\" {path} to {storage.root}/{folderpath.name}\"\n                )\n                folder_key_path = Path(folderpath.name)\n            else:\n                # maintain the hierarchy within an existing storage location\n                folder_key_path = get_relative_path_to_directory(\n                    folderpath, UPath(storage.root)\n                )\n        else:\n            folder_key_path = Path(key)\n\n        folder_key = folder_key_path.as_posix()\n        # silence fine-grained logging\n        verbosity = settings.verbosity\n        verbosity_int = settings._verbosity_int\n        if verbosity_int >= 1:\n            settings.verbosity = \"warning\"\n        artifacts_dict = {}\n        for filepath in folderpath.rglob(\"*\"):\n            if filepath.is_file():\n          
      relative_path = get_relative_path_to_directory(filepath, folderpath)\n                artifact_key = folder_key + \"/\" + relative_path.as_posix()\n                # if creating from rglob, we don't need to check for existence\n                artifact = Artifact(\n                    filepath, run=run, key=artifact_key, skip_check_exists=True\n                )\n                artifacts_dict[artifact.uid] = artifact\n        settings.verbosity = verbosity\n\n        # run sanity check on hashes\n        hashes = [\n            artifact.hash\n            for artifact in artifacts_dict.values()\n            if artifact.hash is not None\n        ]\n        uids = artifacts_dict.keys()\n        n_unique_hashes = len(set(hashes))\n        if n_unique_hashes == len(hashes):\n            artifacts = SQLRecordList(artifacts_dict.values())\n        else:\n            # consider exact duplicates (same id, same hash)\n            # below can't happen anymore because artifacts is a dict now\n            # if len(set(uids)) == len(set(hashes)):\n            #     logger.warning(\"dropping duplicate records in list of artifact records\")\n            #     artifacts = list(set(uids))\n            # consider false duplicates (different id, same hash)\n            if not len(set(uids)) == n_unique_hashes:\n                seen_hashes = set()\n                non_unique_artifacts = {\n                    hash: artifact\n                    for hash, artifact in artifacts_dict.items()\n                    if artifact.hash in seen_hashes or seen_hashes.add(artifact.hash)  # type: ignore\n                }\n                display_non_unique = \"\\n    \".join(\n                    f\"{artifact}\" for artifact in non_unique_artifacts\n                )\n                logger.warning(\n                    \"there are multiple artifact uids with the same hashes, dropping\"\n                    f\" {len(non_unique_artifacts)} duplicates out of\"\n                    f\" 
{len(artifacts_dict)} artifacts:\\n    {display_non_unique}\"\n                )\n                artifacts = SQLRecordList(\n                    [\n                        artifact\n                        for artifact in artifacts_dict.values()\n                        if artifact not in non_unique_artifacts.values()\n                    ]\n                )\n        logger.success(\n            f\"created {len(artifacts)} artifacts from directory using storage\"\n            f\" {storage.root} and key = {folder_key}/\"\n        )\n        return artifacts\n\n    def replace(\n        self,\n        data: Union[AnyPathStr, pd.DataFrame, AnnData, MuData],\n        run: Run | bool | None = None,\n        format: str | None = None,\n    ) -> None:\n        \"\"\"Replace the artifact content in storage **without** making a new version.\n\n        **Note:** If you want to create a new version, do **not** use the `.replace()` method but rather any `Artifact` constructor.\n\n        Args:\n            data: A file path or in-memory dataset object like a `DataFrame`, `AnnData`, `MuData`, or `SpatialData`.\n            run: `Run | bool | None = None` The run that creates the artifact.\n                If `False`, suppress tracking the run.\n                If `None`, infer the run from the global run context.\n            format: `str | None = None` The format of the data to write into storage.\n                If `None`, infer the format from the data.\n\n        Example:\n\n            Query a text file and replace its content::\n\n                artifact = ln.Artifact.get(key=\"my_file.txt\")\n                artifact.replace(\"./my_new_file.txt\")\n                artifact.save()\n\n            Note that you need to call `.save()` to persist the changes in storage.\n        \"\"\"\n        storage = settings.storage.record\n        run = get_run(run)\n        kwargs, privates = get_artifact_kwargs_from_data(\n            provisional_uid=self.uid,\n            
data=data,\n            key=self.key,\n            run=run,\n            format=format,\n            storage=storage,\n            version_tag=None,\n            is_replace=True,\n        )\n\n        # this artifact already exists\n        if isinstance(kwargs, Artifact):\n            return kwargs\n\n        check_path_in_storage = privates[\"check_path_in_storage\"]\n        if check_path_in_storage:\n            err_msg = (\n                \"Can only replace with a local path not in any Storage. \"\n                f\"This data is in {Storage.objects.get(id=kwargs['storage_id'])}.\"\n            )\n            raise ValueError(err_msg)\n\n        _overwrite_versions = kwargs[\"_overwrite_versions\"]\n        if self._overwrite_versions != _overwrite_versions:\n            err_msg = \"It is not allowed to replace \"\n            err_msg += \"a folder\" if self._overwrite_versions else \"a file\"\n            err_msg += \" with \" + (\"a folder.\" if _overwrite_versions else \"a file.\")\n            raise ValueError(err_msg)\n\n        new_suffix = kwargs[\"suffix\"]\n        if new_suffix != self.suffix:\n            key = self.key\n            real_key = self._real_key\n            if key is not None:\n                new_key = PurePosixPath(key).with_suffix(new_suffix).as_posix()\n            else:\n                new_key = None\n            if (key is not None and not self._key_is_virtual) or real_key is not None:\n                # real_key is not None implies key is not None\n                assert key is not None  # noqa: S101\n                if real_key is not None:\n                    self._clear_storagekey = real_key\n                    self._real_key = (\n                        PurePosixPath(real_key).with_suffix(new_suffix).as_posix()\n                    )\n                    warn_msg = f\", _real_key '{real_key}' with '{self._real_key}'\"\n                else:\n                    self._clear_storagekey = key\n                    warn_msg = 
\"\"\n                self.key = new_key\n                self._original_values[\"key\"] = new_key\n                logger.warning(\n                    f\"replacing the file will replace key '{key}' with '{new_key}'{warn_msg}\"\n                    f\" and delete '{self._clear_storagekey}' upon `save()`\"\n                )\n            else:\n                # purely virtual key case\n                self._clear_storagekey = _s().auto_storage_key_from_artifact(self)\n                # might replace None with None, not a big deal\n                self.key = new_key\n                self._original_values[\"key\"] = new_key\n\n        self.suffix = new_suffix\n        self.size = kwargs[\"size\"]\n        self.hash = kwargs[\"hash\"]\n        self._hash_type = kwargs[\"_hash_type\"]\n        self.run_id = kwargs[\"run_id\"]\n        self.run = kwargs[\"run\"]\n        self.n_files = kwargs[\"n_files\"]\n\n        self._local_filepath = privates[\"local_filepath\"]\n        self._cloud_filepath = privates[\"cloud_filepath\"]\n        self._memory_rep = privates[\"memory_rep\"]\n        # no need to upload if new file is already in storage\n        self._to_store = not check_path_in_storage\n\n        # update old suffix with the new one so that the check in artifact save pass\n        # replace() supports changing the suffix\n        self._original_values[\"suffix\"] = self.suffix\n\n    def open(\n        self,\n        mode: str = \"r\",\n        engine: Literal[\"pyarrow\", \"polars\"] = \"pyarrow\",\n        is_run_input: bool | None = None,\n        **kwargs,\n    ) -> (\n        PyArrowDataset\n        # PolarsLazyFrame does not implement the context manager protocol hence we need `Iterator` in the type annotation\n        | Iterator[\n            PolarsLazyFrame\n        ]  # note that intersphinx doesn't work for this, hence manual docs link: https://github.com/laminlabs/lamindb/issues/2736#issuecomment-3703889524\n        | AnnDataAccessor  # AnnDataAccessor 
implements the context manager protocol\n        | SpatialDataAccessor\n        | BackedAccessor\n        | SOMACollection\n        | SOMAExperiment\n        | SOMAMeasurement\n    ):\n        \"\"\"Open a dataset for streaming.\n\n        Works for the following object types (storage formats):\n\n        - `DataFrame` (`.parquet`, `.csv`, `.ipc` files or directories with such files)\n        - `AnnData` (`.h5ad`, `.zarr`)\n        - `SpatialData` (`.zarr`)\n        - `tiledbsoma` (`.tiledbsoma`)\n        - generic arrays (`.h5`, `.zarr`)\n\n        Args:\n            mode: can be `\"r\"` or `\"w\"` (write mode) for `tiledbsoma` stores,\n                `\"r\"` or `\"r+\"` for `AnnData` or `SpatialData` `zarr` stores,\n                otherwise should be always `\"r\"` (read-only mode).\n            engine: Which module to use for lazy loading of a dataframe\n                from `pyarrow` or `polars` compatible formats.\n                This has no effect if the artifact is not a dataframe, i.e.\n                if it is an `AnnData,` `hdf5`, `zarr`, `tiledbsoma` object etc.\n            is_run_input: Whether to track this artifact as run input.\n            **kwargs: Keyword arguments for the accessor, i.e. 
`h5py` or `zarr` connection,\n                `pyarrow.dataset.dataset`, `polars.scan_*` function.\n\n        Returns:\n            Streaming accessors, in particular,\n            a :class:`pyarrow:pyarrow.dataset.Dataset` object,\n            a context manager yielding a `polars.LazyFrame <https://docs.pola.rs/api/python/stable/reference/lazyframe/>`__,\n            and objects of type :class:`~lamindb.core.storage.AnnDataAccessor`, :class:`~lamindb.core.storage.SpatialDataAccessor`, :class:`~lamindb.core.storage.BackedAccessor`,\n            :class:`tiledbsoma:tiledbsoma.Collection`, :class:`tiledbsoma.Experiment`, :class:`tiledbsoma.Measurement`.\n\n        Note:\n            For TileDB-SOMA stores on S3 with federated credentials,\n            credentials are updated only when the storage is opened, not while the\n            store handle is held open. If credentials expire during a long-lived\n            session, close the store and open it again to refresh.\n\n        Examples:\n\n            Open a `DataFrame`-like artifact via :class:`pyarrow:pyarrow.dataset.Dataset`::\n\n                artifact = ln.Artifact.get(key=\"sequences/mydataset.parquet\")\n                artifact.open()\n                #> pyarrow._dataset.FileSystemDataset\n\n            Open a `DataFrame`-like artifact via `polars.LazyFrame <https://docs.pola.rs/api/python/stable/reference/lazyframe/>`__::\n\n                artifact = ln.Artifact.get(key=\"sequences/mydataset.parquet\")\n                with artifact.open(engine=\"polars\") as df:\n                    # use the `polars.LazyFrame` object similar to a `DataFrame` object\n\n            Open an `AnnData`-like artifact via :class:`~lamindb.core.storage.AnnDataAccessor`::\n\n                import lamindb as ln\n\n                artifact = ln.Artifact.get(key=\"scrna/mydataset.h5ad\")\n                with artifact.open() as adata:\n                    # use the `AnnDataAccessor` similar to an `AnnData` object\n\n            
For more examples and background, see guide: :doc:`/arrays`.\n\n        \"\"\"\n        from ..core.storage._backed_access import _track_writes_factory, backed_access\n        from ..core.storage._polars_lazy_df import POLARS_SUFFIXES\n        from ..core.storage._pyarrow_dataset import PYARROW_SUFFIXES\n\n        if self._overwrite_versions and not self.is_latest:\n            raise ValueError(OUTDATED_ARTIFACT_FILES_OVERWRITTEN_MSG)\n        # all hdf5 suffixes including gzipped\n        h5_suffixes = [\".h5\", \".hdf5\", \".h5ad\"]\n        h5_gz_suffixes = []\n        for s in h5_suffixes:\n            h5_gz_suffixes += [s, s + \".gz\", s + \".tar.gz\"]\n        # ignore empty suffix for now\n        df_suffixes = tuple(set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES))\n        suffixes = (\n            (\n                \"\",\n                \".zarr\",\n                \".anndata.zarr\",\n                \".tiledbsoma\",\n            )\n            + tuple(h5_gz_suffixes)\n            + df_suffixes\n        )\n        suffix = self.suffix\n        if suffix not in suffixes:\n            raise ValueError(\n                \"Artifact should have a zarr, h5, tiledbsoma object\"\n                \" or a compatible `pyarrow.dataset.dataset` or `polars.scan_*` directory\"\n                \" as the underlying data, please use one of the following suffixes\"\n                f\" for the object name: {', '.join(suffixes[1:])}.\"\n                f\" Or no suffix for a folder with {', '.join(df_suffixes)} files\"\n                \" (no mixing allowed).\"\n            )\n        using_key = settings._using_key\n        filepath, cache_key = _s().filepath_cache_key_from_artifact(\n            self, using_key=using_key\n        )\n\n        is_tiledbsoma_w = (\n            filepath.name == \"soma\" or suffix == \".tiledbsoma\"\n        ) and mode == \"w\"\n        is_zarr_w = suffix == \".zarr\" and mode == \"r+\"\n\n        if mode != \"r\":\n            if not 
(is_tiledbsoma_w or is_zarr_w):\n                raise ValueError(\n                    f\"It is not allowed to open a {suffix} object with `mode='{mode}'`. \"\n                    \"You can open all supported formats with `mode='r'`, \"\n                    \"a tiledbsoma store with `mode='w'`, \"\n                    \"AnnData or SpatialData zarr store with `mode='r+'`.\"\n                )\n            elif not self.overwrite_versions:\n                raise ValueError(\n                    \"It is not possible to open artifacts having `overwrite_versions=False` \"\n                    \"in non-read mode (other than `mode='r'`).\"\n                )\n        # consider the case where an object is already locally cached\n        localpath = setup_settings.paths.cloud_to_local_no_update(\n            filepath, cache_key=cache_key\n        )\n        if is_tiledbsoma_w or is_zarr_w:\n            open_cache = False\n        else:\n            open_cache = not isinstance(\n                filepath, LocalPathClasses\n            ) and not filepath.synchronize_to(localpath, just_check=True)\n        if open_cache:\n            try:\n                access = backed_access(\n                    localpath, mode, engine, using_key=using_key, **kwargs\n                )\n            except Exception as e:\n                # also ignore ValueError here because\n                # such errors most probably just imply an incorrect argument\n                if isinstance(e, (ImportError, ValueError)) or isinstance(\n                    filepath, LocalPathClasses\n                ):\n                    raise e\n                logger.warning(\n                    f\"The cache might be corrupted: {e}. 
Trying to open directly.\"\n                )\n                access = backed_access(\n                    filepath, mode, engine, using_key=using_key, **kwargs\n                )\n                # happens only if backed_access has been successful\n                # delete the corrupted cache\n                if localpath.is_dir():\n                    shutil.rmtree(localpath)\n                else:\n                    localpath.unlink(missing_ok=True)\n        else:\n            access = backed_access(self, mode, engine, using_key=using_key, **kwargs)\n            if is_tiledbsoma_w:\n\n                def finalize():\n                    nonlocal self, filepath, localpath\n                    if not isinstance(filepath, LocalPathClasses):\n                        _, hash, _, _ = get_stat_dir_cloud(filepath)\n                    else:\n                        # this can be very slow\n                        _, hash, _, _ = hash_dir(filepath)\n                    if self.hash != hash:\n                        from .sqlrecord import init_self_from_db\n\n                        new_version = Artifact(\n                            filepath, revises=self, _is_internal_call=True\n                        ).save()\n                        # note: sets _state.db = \"default\"\n                        init_self_from_db(self, new_version)\n\n                        if localpath != filepath and localpath.exists():\n                            shutil.rmtree(localpath)\n\n                access = _track_writes_factory(access, finalize)\n        # only call if open is successfull\n        track_run_input(self, is_run_input)\n        return access\n\n    def load(\n        self, *, is_run_input: bool | None = None, mute: bool = False, **kwargs\n    ) -> (\n        pd.DataFrame\n        | ScverseDataStructures\n        | dict[str, Any]\n        | list[Any]\n        | AnyPathStr\n        | None\n    ):\n        \"\"\"Cache artifact in local cache and then load it into 
memory.\n\n        See: :mod:`~lamindb.core.loaders`.\n\n        Args:\n            is_run_input: Whether to track this artifact as run input.\n            mute: Silence logging of caching progress.\n            **kwargs: Keyword arguments for the loader.\n\n        Examples:\n\n            Load a `DataFrame`-like artifact::\n\n                df = artifact.load()\n\n            Load an `AnnData`-like artifact::\n\n                adata = artifact.load()\n        \"\"\"\n        from ..core.loaders import load_to_memory\n\n        if self._overwrite_versions and not self.is_latest:\n            raise ValueError(OUTDATED_ARTIFACT_FILES_OVERWRITTEN_MSG)\n\n        if hasattr(self, \"_memory_rep\") and self._memory_rep is not None:\n            access_memory = self._memory_rep\n            # SpatialData objects zarr stores are moved when saved\n            # SpatialData's __repr__ method attempts to access information from the old path\n            # Therefore, we need to update the in-memory path to the now moved Artifact storage path\n            if access_memory.__class__.__name__ == \"SpatialData\":\n                access_memory.path = self._cache_path\n        else:\n            filepath, cache_key = _s().filepath_cache_key_from_artifact(\n                self, using_key=settings._using_key\n            )\n            cache_path = _synchronize_cleanup_on_error(\n                filepath, cache_key=cache_key, print_progress=not mute\n            )\n            try:\n                # cache_path is local so doesn't trigger any sync in load_to_memory\n                access_memory = load_to_memory(cache_path, **kwargs)\n            except Exception as e:\n                # raise the exception if it comes from not having a correct loader\n                # import error is also most probbaly not a problem with the cache\n                # or if the original path is local\n                if isinstance(e, (NotImplementedError, ImportError)) or isinstance(\n            
        filepath, LocalPathClasses\n                ):\n                    raise e\n                logger.warning(\n                    f\"The cache might be corrupted: {e}. Retrying to synchronize.\"\n                )\n                # delete the existing cache\n                if cache_path.is_dir():\n                    shutil.rmtree(cache_path)\n                else:\n                    cache_path.unlink(missing_ok=True)\n                # download again and try to load into memory\n                cache_path = _synchronize_cleanup_on_error(\n                    filepath, cache_key=cache_key, print_progress=not mute\n                )\n                access_memory = load_to_memory(cache_path, **kwargs)\n        # only call if load is successfull\n        track_run_input(self, is_run_input)\n\n        return access_memory\n\n    def cache(\n        self, *, is_run_input: bool | None = None, mute: bool = False, **kwargs\n    ) -> UPath:\n        \"\"\"Download cloud artifact to local cache.\n\n        Follows synching logic: only caches an artifact if it's outdated in the local cache.\n\n        Returns a path to a locally cached on-disk object (say a `.jpg` file).\n\n        Args:\n            mute: Silence logging of caching progress.\n            is_run_input: Whether to track this artifact as run input.\n\n        Example:\n\n            Sync the artifact from the cloud and return the local path to the cached file::\n\n                artifact.cache()\n                #> PosixPath('/home/runner/work/Caches/lamindb/lamindata/pbmc68k.h5ad')\n        \"\"\"\n        if self._overwrite_versions and not self.is_latest:\n            raise ValueError(OUTDATED_ARTIFACT_FILES_OVERWRITTEN_MSG)\n\n        filepath, cache_key = _s().filepath_cache_key_from_artifact(\n            self, using_key=settings._using_key\n        )\n        if mute:\n            kwargs[\"print_progress\"] = False\n        cache_path = _synchronize_cleanup_on_error(\n            filepath, 
cache_key=cache_key, **kwargs\n        )\n        # only call if sync is successfull\n        track_run_input(self, is_run_input)\n        return cache_path\n\n    def delete(\n        self,\n        permanent: bool | None = None,\n        storage: bool | None = None,\n        using_key: str | None = None,\n    ) -> None:\n        \"\"\"Trash or permanently delete.\n\n        A first call to `.delete()` puts an artifact into the trash (sets `branch_id` to `-1`).\n        A second call permanently deletes the artifact.\n\n        For an `artifact` that has multiple versions and for which `artifact.overwrite_versions is True`, the default behavior for folders,\n        deleting a non-latest version will not delete the underlying storage unless `storage=True` is passed.\n        Deleting the latest version will delete all versions.\n\n        Args:\n            permanent: Permanently delete the artifact (skip trash).\n            storage: Indicate whether you want to delete the artifact in storage.\n\n        Examples:\n\n            Delete a single file artifact::\n\n                import lamindb as ln\n\n                artifact = ln.Artifact.get(key=\"some.csv\")\n                artifact.delete() # delete a single file artifact\n\n            Delete an old version of a folder-like artifact::\n\n                artifact = ln.Artifact.filter(key=\"folder.zarr\", is_latest=False).first()\n                artiact.delete() # delete an old version, the data will not be deleted\n\n            Delete all versions of a folder-like artifact::\n\n                artifact = ln.Artifact.get(key=\"folder.zarr\". 
is_latest=True)\n                artifact.delete() # delete all versions, the data will be deleted or prompted for deletion.\n        \"\"\"\n        super().delete(permanent=permanent, storage=storage, using_key=using_key)\n\n    # TODO: consider renaming the transfer argument to sync\n    def save(\n        self,\n        upload: bool | None = None,\n        transfer: Literal[\"record\", \"annotations\"] = \"record\",\n        **kwargs,\n    ) -> Artifact:\n        \"\"\"Save to database & storage.\n\n        Args:\n            upload: Trigger upload to cloud storage in instances with hybrid storage mode.\n            transfer: In case artifact was queried on a different instance, dictates behavior of sync.\n                If \"record\", only the artifact record is synced to the current instance.\n                If \"annotations\", also the annotations linked in the source instance are synced.\n\n        See Also:\n            :doc:`sync`\n\n        Example:\n\n            Save a file-like artifact after creating it with the default constructor `Artifact()`::\n\n                import lamindb as ln\n\n                artifact = ln.Artifact(\"./myfile.csv\", key=\"myfile.parquet\").save()\n        \"\"\"\n        if (\n            not self._state.adding\n            # skip on is_latest change\n            # no need to check if saved because it is checked above\n            and not self._field_changed(\"is_latest\", check_is_saved=False)\n            and not self.is_latest\n            and self.branch_id != -1  # skip on soft deletion\n        ):\n            logger.warning(\"you are saving to a non-latest version of the artifact\")\n\n        access_token = kwargs.pop(\"access_token\", None)\n\n        current_instance_uid = setup_settings.instance.uid\n\n        artifact_storage = self.storage\n        artifact_storage_instance_uid = artifact_storage.instance_uid\n        is_not_artifact_storage_managed_by_current_instance = (\n            
artifact_storage_instance_uid != current_instance_uid\n        )\n\n        if self._field_changed(\"key\", check_is_saved=False):\n            new_key = self.key\n            if new_key is None:\n                raise InvalidArgument(\"Cannot update an artifact key to None.\")\n            new_key_suffix = extract_suffix_from_path(\n                PurePosixPath(new_key), arg_name=\"key\"\n            )\n            if new_key_suffix != self.suffix:\n                raise InvalidArgument(\n                    f\"The suffix '{new_key_suffix}' of the provided key is incorrect, it should be '{self.suffix}'.\"\n                )\n            # Virtual key updates are metadata-only because physical storage keys are\n            # uid-based.\n            if self._key_is_virtual:\n                self._original_values[\"key\"] = new_key\n            else:\n                if self._state.adding:\n                    raise InvalidArgument(\n                        \"Cannot update the key of an artifact before it is saved.\"\n                    )\n                if is_not_artifact_storage_managed_by_current_instance:\n                    raise InvalidArgument(\n                        \"Cannot update a non-virtual key of an artifact\"\n                        \" in a storage location that is not managed by the current instance.\"\n                    )\n                old_key = self._original_values[\"key\"]\n                if old_key is None:\n                    raise InvalidArgument(\n                        \"Cannot update a non-virtual artifact key from None.\"\n                    )\n                if not _handle_non_virtual_key_change_on_save(\n                    self, old_key=old_key, new_key=new_key\n                ):\n                    return None\n\n        if self._field_changed(\"suffix\", check_is_saved=False):\n            if self._state.adding:\n                raise InvalidArgument(\n                    \"Cannot update the suffix of an artifact 
before it is saved.\"\n                )\n            if is_not_artifact_storage_managed_by_current_instance:\n                raise InvalidArgument(\n                    \"Cannot update the suffix of an artifact\"\n                    \" in a storage location that is not managed by the current instance.\"\n                )\n            if not _handle_suffix_change_on_save(self):\n                return None\n\n        # when space is passed in init, storage is ignored, so space - storage consistency is enforced there\n        if (\n            self._field_changed(\"space_id\")\n            # here we check for storages managed by any instance\n            # not necessarily with managed credentials\n            # we check if the artifact storage is managed by the current instance further\n            and artifact_storage_instance_uid is not None\n        ):\n            if is_not_artifact_storage_managed_by_current_instance:\n                raise ValueError(\n                    \"Cannot change the space of an artifact\"\n                    \" in a storage location that is not managed by the current instance.\"\n                )\n            space = self.space\n            storage_type = artifact_storage.type\n            storages = Storage.connect(self._state.db).filter(\n                space=space, instance_uid=current_instance_uid, type=storage_type\n            )\n            n_storages = storages.count()\n            if n_storages == 0:\n                raise ValueError(\n                    f\"No {storage_type} storage locations managed by the current instance found for the space '{space.name}'.\"\n                )\n            elif n_storages > 1:\n                storages = storages.order_by(\"id\")\n                roots_str = \"\\n\".join(\n                    f\"{i}: {storage.root}\" for i, storage in enumerate(storages)\n                )\n                choice = input(\n                    f\"Select a storage location of type '{storage_type}' 
from the target space '{space.name}':\"\n                    f\" \\n{roots_str}\\n\"\n                    \"Enter the number or 'x' to cancel: \"\n                )\n                if choice == \"x\":\n                    logger.warning(\"saving was cancelled\")\n                    return None\n                storage = storages[int(choice)]\n            else:\n                storage = storages.one()\n            if artifact_storage != storage:\n                # try to transfer if both storages are writable / managed by an instance\n                # replaces artifact.storage with the new storage if successful\n                _move_artifact_to_storage(self, storage, access_token=access_token)\n            else:\n                logger.important(\"artifact is already in the target storage location\")\n            # Keep tracked values in sync after handling a space update so\n            # repeated saves don't keep re-running this branch.\n            self._original_values[\"space_id\"] = self.space_id\n\n        if transfer not in {\"record\", \"annotations\"}:\n            raise ValueError(\n                f\"transfer should be either 'record' or 'annotations', not {transfer}\"\n            )\n        else:\n            kwargs[\"transfer\"] = transfer\n        state_was_adding = self._state.adding\n        print_progress = kwargs.pop(\"print_progress\", True)\n        store_kwargs = kwargs.pop(\n            \"store_kwargs\", {}\n        )  # kwargs for .upload_from in the end\n        local_path = None\n        if upload and setup_settings.instance.keep_artifacts_local:\n            # switch local storage location to cloud\n            local_path = self.path\n            self.storage_id = setup_settings.instance.storage._id\n            self._local_filepath = local_path\n            # switch to virtual storage key upon upload\n            # the local filepath is already cached at that point\n            self._key_is_virtual = True\n            # ensure that 
the artifact is uploaded\n            self._to_store = True\n\n        local_filepath = getattr(self, \"_local_filepath\", None)\n        has_local_filepath = local_filepath is not None\n        if has_local_filepath and not local_filepath.exists():\n            raise FileNotFoundError(\n                f\"Unable to save the artifact because the local path {local_filepath} does not exist.\"\n            )\n\n        flag_complete = has_local_filepath and getattr(self, \"_to_store\", False)\n        if flag_complete:\n            if is_not_artifact_storage_managed_by_current_instance:\n                raise ValueError(\n                    \"Cannot save an artifact to a storage location that is not managed by the current instance.\"\n                )\n            # _storage_ongoing indicates whether the storage saving / upload process is ongoing\n            self._storage_ongoing = True  # will be updated to False once complete\n\n        self._save_skip_storage(**kwargs)\n\n        using_key = None\n        if \"using\" in kwargs:\n            using_key = kwargs[\"using\"]\n        exception_upload = check_and_attempt_upload(\n            self,\n            using_key,\n            access_token=access_token,\n            print_progress=print_progress,\n            **store_kwargs,\n        )\n        if exception_upload is not None:\n            # we do not want to raise file not found on cleanup if upload of a file failed\n            # often it is ACID in the filesystem itself\n            # for example, s3 won't have the failed file, so just skip the delete in this case\n            raise_file_not_found_error = False\n            self._delete_skip_storage()\n        else:\n            # this is the case when it is cleaned on .replace\n            raise_file_not_found_error = True\n        # this is triggered by an exception in check_and_attempt_upload or by replace.\n        exception_clear = check_and_attempt_clearing(\n            self,\n            
raise_file_not_found_error=raise_file_not_found_error,\n            using_key=using_key,\n        )\n        if exception_upload is not None:\n            raise exception_upload\n        if exception_clear is not None:\n            raise exception_clear\n        # the saving / upload process has been successful\n        if flag_complete:\n            self._storage_ongoing = False\n            # pass kwargs below because it can contain `using` or other things\n            # affecting the connection\n            super().save(**kwargs)\n\n        # this is only for keep_artifacts_local\n        if local_path is not None and not state_was_adding:\n            # only move the local artifact to cache if it was not newly created\n            local_path_cache = ln_setup.settings.cache_dir / local_path.name\n            # don't use Path.rename here because of cross-device link error\n            # https://laminlabs.slack.com/archives/C04A0RMA0SC/p1710259102686969\n            shutil.move(\n                local_path,  # type: ignore\n                local_path_cache,\n            )\n            logger.important(f\"moved local artifact to cache: {local_path_cache}\")\n\n        # annotate with external features\n        if hasattr(self, \"_external_features\"):\n            external_features = self._external_features\n            self.features.set_values(external_features)\n        # annotate with internal features based on curator\n        if hasattr(self, \"_curator\"):\n            curator = self._curator\n            del self._curator\n            # just annotates this artifact\n            curator.save_artifact()\n        if hasattr(self, \"_external_features\"):\n            del self._external_features\n        if hasattr(self, \"_local_filepath\"):\n            del self._local_filepath\n        return self\n\n\ndef _update_artifact_keys_with_suffix(artifact: Artifact, suffix: str):\n    key = artifact.key\n    real_key = artifact._real_key\n    if key is not None:\n   
     new_key = PurePosixPath(key).with_suffix(suffix).as_posix()\n        artifact.key = new_key\n    if real_key is not None:\n        artifact._real_key = PurePosixPath(real_key).with_suffix(suffix).as_posix()\n\n\ndef _confirm_artifact_move(source_path_str: str, target_path_str: str) -> bool:\n    # ask for confirmation\n    # TODO: add a way to disable confirmation\n    response = input(\n        f\"You are about to move artifact from '{source_path_str}' to '{target_path_str}'.\\n\"\n        \"Continue? (y/n) \"\n    )\n    if response != \"y\":\n        logger.warning(\"saving was cancelled\")\n        return False\n    return True\n\n\ndef _handle_non_virtual_key_change_on_save(\n    artifact: Artifact, *, old_key: str, new_key: str\n) -> bool:\n    # _real_key should actually be None here because it goes with virtual key\n    source_storage_key = (\n        artifact._real_key if artifact._real_key is not None else old_key\n    )\n    source_path = artifact.storage.path / source_storage_key\n    # key was updated, so artifact.path is the new path\n    target_path_str = artifact.path.as_posix()\n    source_path_str = source_path.as_posix()\n    if not _confirm_artifact_move(source_path_str, target_path_str):\n        return False\n    _safe_move(source_path.fs, source_path_str, target_path_str)\n    if artifact._real_key is not None:\n        artifact._real_key = new_key\n    # Keep tracked values in sync so repeated saves don't trigger another move.\n    artifact._original_values[\"key\"] = new_key\n    # If key change already applied the suffix transition, skip suffix handling below.\n    artifact._original_values[\"suffix\"] = artifact.suffix\n    return True\n\n\ndef _handle_suffix_change_on_save(artifact: Artifact) -> bool:\n    suffix = artifact.suffix\n    # depends on whether key is virtual or real key is present\n    source_or_target_path = artifact.path\n    source_path_str = source_or_target_path.with_suffix(\n        
artifact._original_values[\"suffix\"]\n    ).as_posix()\n    target_path_str = source_or_target_path.with_suffix(suffix).as_posix()\n    if not _confirm_artifact_move(source_path_str, target_path_str):\n        return False\n    # source_path and target_path are on the same filesystem\n    _safe_move(source_or_target_path.fs, source_path_str, target_path_str)\n    _update_artifact_keys_with_suffix(artifact, suffix)\n    # Keep tracked values in sync so consecutive suffix updates on the same\n    # in-memory instance trigger a move each time.\n    artifact._original_values[\"suffix\"] = suffix\n    artifact._original_values[\"key\"] = artifact.key\n    return True\n\n\ndef _sorted_sizes(fs: AbstractFileSystem, path: str) -> list[int]:\n    objects = fs.find(path, detail=True)\n    return sorted(info[\"size\"] for info in objects.values())\n\n\ndef _rm_catch_error(fs: AbstractFileSystem, path: str) -> Exception | None:\n    if fs.exists(path):\n        try:\n            fs.rm(path, recursive=True)\n        except Exception as rm_exc:\n            return rm_exc\n    return None\n\n\ndef _safe_move(fs: AbstractFileSystem, source: str, target: str):\n    if fs.exists(target):\n        raise FileExistsError(\n            f\"Cannot move artifact to '{target}' because it already exists.\"\n        )\n    logger.important(f\"moving artifact from '{source}' to '{target}'\")\n    try:\n        fs.copy(source, target, recursive=True)\n    except Exception as e:\n        message = \"Failed to copy artifact to target storage during transfer.\"\n        cleanup_error = _rm_catch_error(fs, target)\n        if cleanup_error is not None:\n            message += f\" Cleanup of copied target also failed: {cleanup_error}\"\n        raise RuntimeError(message) from e\n    # check that the sizes of the files are the same\n    if _sorted_sizes(fs, source) != _sorted_sizes(fs, target):\n        message = \"Move verification failed: copied artifact does not match source.\"\n        
cleanup_error = _rm_catch_error(fs, target)\n        if cleanup_error is not None:\n            message += \" Cleanup of copied target also failed.\"\n        raise RuntimeError(message) from cleanup_error\n\n    try:\n        fs.rm(source, recursive=True)\n    except Exception as e:\n        logger.error(\n            f\"copying to '{target}' succeeded but failed to remove source '{source}': {e}\"\n        )\n\n\ndef _move_artifact_to_storage(\n    artifact: Artifact, storage: Storage, access_token: str | None = None\n):\n    storage_key = _s().auto_storage_key_from_artifact(artifact)\n\n    source_path = artifact.path\n    target_path = storage.path / storage_key\n    if source_path == target_path:\n        raise ValueError(\"Cannot move to the same path.\")\n\n    fs = fs_for_moving(source_path, target_path, access_token=access_token)\n\n    source_path_str = str(source_path)\n    target_path_str = str(target_path)\n\n    _safe_move(fs, source_path_str, target_path_str)\n\n    artifact.storage_id = storage.id\n\n\n# can't really just call .cache in .load because of double tracking\ndef _synchronize_cleanup_on_error(\n    filepath: UPath, cache_key: str | None = None, **kwargs\n) -> UPath:\n    try:\n        print_progress = kwargs.pop(\"print_progress\", True)\n        cache_path = setup_settings.paths.cloud_to_local(\n            filepath, cache_key=cache_key, print_progress=print_progress, **kwargs\n        )\n    except Exception as e:\n        if not isinstance(filepath, LocalPathClasses):\n            cache_path = setup_settings.paths.cloud_to_local_no_update(\n                filepath, cache_key=cache_key\n            )\n            if cache_path.is_dir():\n                shutil.rmtree(cache_path)\n            else:\n                cache_path.unlink(missing_ok=True)\n        raise e\n    return cache_path\n\n\ndef _delete_skip_storage(artifact, *args, **kwargs) -> None:\n    super(SQLRecord, artifact).delete(*args, **kwargs)\n\n\ndef 
_save_skip_storage(artifact, **kwargs) -> None:\n    save_staged_schemas(artifact)\n    super(Artifact, artifact).save(**kwargs)\n    save_schema_links(artifact)\n\n\nclass ArtifactJsonValue(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name=\"links_jsonvalue\")\n    # we follow the lower() case convention rather than snake case for link models\n    jsonvalue: JsonValue = ForeignKey(JsonValue, PROTECT, related_name=\"links_artifact\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"artifact\", \"jsonvalue\")\n\n\nclass ArtifactUser(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    artifact: Artifact = ForeignKey(\"Artifact\", CASCADE, related_name=\"links_user\")\n    user: User = ForeignKey(User, PROTECT, related_name=\"links_artifact\")\n    feature: Feature | None = ForeignKey(\n        Feature, PROTECT, null=True, related_name=\"links_artifactuser\", default=None\n    )\n\n    class Meta:\n        # can have the same label linked to the same artifact if the feature is\n        # different\n        app_label = \"lamindb\"\n        unique_together = (\"artifact\", \"user\", \"feature\")\n\n\nclass ArtifactRun(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    artifact: Artifact = ForeignKey(\"Artifact\", CASCADE, related_name=\"links_run\")\n    # consciously choosing CASCADE\n    run: Run = ForeignKey(Run, CASCADE, related_name=\"links_artifact\")\n    feature: Feature | None = ForeignKey(\n        Feature, PROTECT, null=True, related_name=\"links_artifactrun\", default=None\n    )\n\n    class Meta:\n        # can have the same label linked to the same artifact if the feature is\n        # different\n        app_label = \"lamindb\"\n        unique_together = (\"artifact\", \"run\", \"feature\")\n\n\nclass ArtifactArtifact(BaseSQLRecord, 
IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    artifact: Artifact = ForeignKey(\"Artifact\", CASCADE, related_name=\"links_artifact\")\n    # consciously choosing CASCADE\n    value: Artifact = ForeignKey(\"Artifact\", CASCADE, related_name=\"links_value\")\n    feature: Feature | None = ForeignKey(\n        Feature, PROTECT, null=True, related_name=\"links_artifactartifact\", default=None\n    )\n\n    class Meta:\n        # can have the same label linked to the same artifact if the feature is\n        # different\n        app_label = \"lamindb\"\n        unique_together = (\"artifact\", \"value\", \"feature\")\n\n\ndef track_run_input(\n    record: (\n        Artifact | Iterable[Artifact]\n    ),  # can also be Collection | Iterable[Collection]\n    is_run_input: bool | Run | None = None,\n    run: Run | None = None,\n) -> None:\n    \"\"\"Links a record as an input to a run.\n\n    This function contains all validation logic to make decisions on whether a\n    record qualifies as an input or not.\n    \"\"\"\n    if is_run_input is False:\n        return None\n\n    from ..core._context import context\n    from ..core._functions import get_current_tracked_run\n    from .collection import Collection\n\n    if isinstance(is_run_input, Run):\n        run = is_run_input\n        is_run_input = True\n    elif run is None:\n        run = get_current_tracked_run()\n        if run is None:\n            run = context.run\n    # consider that record is an iterable of Data\n    record_iter: Iterable[Artifact] | Iterable[Collection] = (\n        [record] if isinstance(record, (Artifact, Collection)) else record\n    )\n    input_records = []\n    if run is not None:\n        assert not run._state.adding, \"Save the run before tracking its inputs.\"  # noqa: S101\n\n        def is_valid_input(record: Artifact | Collection):\n            is_valid = False\n            # if a record is not yet saved it has record._state.db = None\n            # 
then it can't be an input\n            # we silently ignore because what will happen is that\n            # the record either gets saved and then is tracked as an output\n            # or it won't get saved at all\n            if record._state.db == \"default\":\n                # things are OK if the record is on the default db\n                is_valid = True\n            else:\n                # record is on another db\n                # we have to save the record into the current db with\n                # the run being attached to a transfer transform\n                logger.info(\n                    f\"completing transfer to track {record.__class__.__name__}('{record.uid}') as input\"\n                )\n                record.save()\n                is_valid = True\n            # avoid cycles: record can't be both input and output\n            if record.run_id == run.id:\n                logger.debug(\n                    f\"not tracking {record} as input to run {run} because created by same run\"\n                )\n                is_valid = False\n            if run.id == getattr(record, \"_subsequent_run_id\", None):\n                logger.debug(\n                    f\"not tracking {record} as input to run {run} because re-created in same run\"\n                )\n                is_valid = False\n            return is_valid\n\n        input_records = [record for record in record_iter if is_valid_input(record)]\n        input_records_ids = [record.id for record in input_records]\n    if input_records:\n        record_class_name = input_records[0].__class__.__name__.lower()\n    # let us first look at the case in which the user does not\n    # provide a boolean value for `is_run_input`\n    # hence, we need to determine whether we actually want to\n    # track a run or not\n    track = False\n    is_run_input = settings.track_run_inputs if is_run_input is None else is_run_input\n    if is_run_input:\n        if run is None:\n            isettings = 
setup_settings.instance\n            if not (isettings._is_clone or isettings.is_read_only_connection):\n                logger.warning(WARNING_NO_INPUT)\n        elif input_records:\n            logger.debug(\n                f\"adding {record_class_name} ids {input_records_ids} as inputs for run {run.id}\"\n            )\n            track = True\n    else:\n        track = is_run_input\n    if not track or not input_records:\n        return None\n    if run is None:\n        raise ValueError(\"No run context set. Call `ln.track()`.\")\n    if record_class_name == \"artifact\":\n        IsLink = run.input_artifacts.through\n        links = [\n            IsLink(run_id=run.id, artifact_id=record_id)\n            for record_id in input_records_ids\n        ]\n    else:\n        IsLink = run.input_collections.through\n        links = [\n            IsLink(run_id=run.id, collection_id=record_id)\n            for record_id in input_records_ids\n        ]\n    try:\n        IsLink.objects.bulk_create(links, ignore_conflicts=True)\n    except ProgrammingError as e:\n        if \"new row violates row-level security policy\" in str(e):\n            instance = setup_settings.instance\n            available_spaces = instance.available_spaces\n            if available_spaces is None:\n                raise NoWriteAccess(\n                    f\"You’re not allowed to write to the instance {instance.slug}.\\n\"\n                    \"Please contact administrators of the instance if you need write access.\"\n                ) from None\n            write_access_spaces = available_spaces[\"admin\"] + available_spaces[\"write\"]\n            no_write_access_spaces = {\n                record_space\n                for record in input_records\n                if (record_space := record.space) not in write_access_spaces\n            }\n            if (run_space := run.space) not in write_access_spaces:\n                no_write_access_spaces.add(run_space)\n\n            if not 
no_write_access_spaces:\n                # if there are no unavailable spaces, then this should be due to locking\n                locked_records = [\n                    record\n                    for record in input_records\n                    if getattr(record, \"is_locked\", False)\n                ]\n                if run.is_locked:\n                    locked_records.append(run)\n                # if no unavailable spaces and no locked records, just raise the original error\n                if not locked_records:\n                    raise e\n                no_write_msg = (\n                    \"It is not allowed to modify locked records: \"\n                    + \", \".join(\n                        r.__class__.__name__ + f\"(uid={r.uid})\" for r in locked_records\n                    )\n                    + \".\"\n                )\n                raise NoWriteAccess(no_write_msg) from None\n\n            if len(no_write_access_spaces) > 1:\n                name_msg = \", \".join(\n                    f\"'{space.name}'\" for space in no_write_access_spaces\n                )\n                space_msg = \"spaces\"\n            else:\n                name_msg = f\"'{no_write_access_spaces.pop().name}'\"\n                space_msg = \"space\"\n            raise NoWriteAccess(\n                f\"You’re not allowed to write to the {space_msg} {name_msg}.\\n\"\n                f\"Please contact administrators of the {space_msg} if you need write access.\"\n            ) from None\n        else:\n            raise e\n\n\n# privates currently dealt with separately\n# mypy: ignore-errors\nArtifact._delete_skip_storage = _delete_skip_storage\nArtifact._save_skip_storage = _save_skip_storage\nArtifact.view_lineage = view_lineage\n\n\n# PostgreSQL migration helper for _save_completed to _aux[\"storage_completed\"]\n\n\ndef migrate_save_completed_to_aux_postgres(schema_editor) -> None:\n    \"\"\"Migrate _save_completed field to _aux['storage_completed'] using 
PostgreSQL raw SQL.\n\n    This migrates _save_completed=False into _aux['storage_completed']=false.\n    _save_completed=True results in no change to _aux (empty JSON is the default).\n    \"\"\"\n    schema_editor.execute(\"\"\"\n        UPDATE lamindb_artifact\n        SET _aux = CASE\n                WHEN _save_completed = FALSE THEN\n                    CASE\n                        WHEN _aux IS NULL THEN\n                            jsonb_build_object('storage_completed', false)\n                        ELSE\n                            _aux || jsonb_build_object('storage_completed', false)\n                    END\n                ELSE _aux\n            END,\n            _save_completed = NULL\n        WHERE _save_completed IS NOT NULL\n    \"\"\")\n"
  },
  {
    "path": "lamindb/models/artifact_set.py",
    "content": "from __future__ import annotations\n\nfrom collections.abc import Iterable, Iterator\nfrom typing import TYPE_CHECKING, Literal\n\nfrom django.db.models import Case, Q, TextField, Value, When\nfrom django.db.models.functions import Concat\nfrom lamin_utils import logger\nfrom lamindb_setup.core._docs import doc_args\nfrom upath import UPath\n\nfrom .artifact import Artifact, track_run_input\nfrom .collection import Collection, _load_concat_artifacts\n\nif TYPE_CHECKING:\n    from anndata import AnnData\n    from lamindb_setup.types import AnyPathStr\n    from pandas import DataFrame\n    from polars import LazyFrame as PolarsLazyFrame\n    from pyarrow.dataset import Dataset as PyArrowDataset\n\n    from ..core._mapped_collection import MappedCollection\n\n\nUNORDERED_WARNING = (\n    \"this query set is unordered, consider using `.order_by()` first \"\n    \"to avoid opening the artifacts in an arbitrary order\"\n)\n\n\n# maybe make this abstract\nclass ArtifactSet(Iterable):\n    \"\"\"Abstract class representing sets of artifacts returned by queries.\n\n    This class automatically extends :class:`~lamindb.models.BasicQuerySet`\n    and :class:`~lamindb.models.QuerySet` when the base model is :class:`~lamindb.Artifact`.\n\n    Examples:\n\n        >>> artifacts = ln.Artifact.filter(otype=\"AnnData\")\n        >>> artifacts # an instance of ArtifactQuerySet inheriting from ArtifactSet\n    \"\"\"\n\n    @doc_args(Collection.load.__doc__)\n    def load(\n        self,\n        join: Literal[\"inner\", \"outer\"] = \"outer\",\n        is_run_input: bool | None = None,\n        **kwargs,\n    ) -> DataFrame | AnnData:\n        \"\"\"{}\"\"\"  # noqa: D415\n        if not self.ordered:  # type: ignore\n            logger.warning(UNORDERED_WARNING)\n\n        artifacts: list[Artifact] = list(self)\n        concat_object = _load_concat_artifacts(artifacts, join, **kwargs)\n        # track only if successful\n        track_run_input(artifacts, 
is_run_input)\n        return concat_object\n\n    @doc_args(Collection.open.__doc__)\n    def open(\n        self,\n        engine: Literal[\"pyarrow\", \"polars\"] = \"pyarrow\",\n        is_run_input: bool | None = None,\n        **kwargs,\n    ) -> PyArrowDataset | Iterator[PolarsLazyFrame]:\n        \"\"\"{}\"\"\"  # noqa: D415\n        from ..core.storage._backed_access import _open_dataframe\n\n        if not self.ordered:  # type: ignore\n            logger.warning(UNORDERED_WARNING)\n\n        artifacts: list[Artifact] = list(self)\n        paths: list[UPath] = [artifact.path for artifact in artifacts]\n\n        dataframe = _open_dataframe(paths, engine=engine, **kwargs)\n        # track only if successful\n        track_run_input(artifacts, is_run_input)\n        return dataframe\n\n    @doc_args(Collection.mapped.__doc__)\n    def mapped(\n        self,\n        layers_keys: str | list[str] | None = None,\n        obs_keys: str | list[str] | None = None,\n        obsm_keys: str | list[str] | None = None,\n        obs_filter: dict[str, str | list[str]] | None = None,\n        join: Literal[\"inner\", \"outer\"] | None = \"inner\",\n        encode_labels: bool | list[str] = True,\n        unknown_label: str | dict[str, str] | None = None,\n        cache_categories: bool = True,\n        parallel: bool = False,\n        dtype: str | None = None,\n        stream: bool = False,\n        is_run_input: bool | None = None,\n    ) -> MappedCollection:\n        \"\"\"{}\"\"\"  # noqa: D415\n        from ..core._mapped_collection import MappedCollection\n\n        if not self.ordered:  # type: ignore\n            logger.warning(UNORDERED_WARNING)\n\n        artifacts: list[Artifact] = []\n        paths: list[UPath] = []\n        for artifact in self:\n            if \".h5ad\" not in artifact.suffix and \".zarr\" not in artifact.suffix:\n                logger.warning(f\"ignoring artifact with suffix {artifact.suffix}\")\n                continue\n            elif 
not stream:\n                paths.append(artifact.cache())\n            else:\n                paths.append(artifact.path)\n            artifacts.append(artifact)\n        ds = MappedCollection(\n            paths,\n            layers_keys,\n            obs_keys,\n            obsm_keys,\n            obs_filter,\n            join,\n            encode_labels,\n            unknown_label,\n            cache_categories,\n            parallel,\n            dtype,\n        )\n        # track only if successful\n        track_run_input(artifacts, is_run_input)\n        return ds\n\n\ndef artifacts_from_path(artifacts: ArtifactSet, path: AnyPathStr) -> ArtifactSet:\n    \"\"\"Returns artifacts in the query set that are registered for the provided path.\"\"\"\n    from lamindb.models import BasicQuerySet, QuerySet\n\n    # not QuerySet but only BasicQuerySet\n    assert isinstance(artifacts, BasicQuerySet) and not isinstance(artifacts, QuerySet)  # noqa: S101\n\n    upath = UPath(path)\n\n    path_str = upath.as_posix()\n\n    stem = upath.stem\n    stem_len = len(stem)\n\n    if stem_len == 16:\n        qs = artifacts.filter(\n            Q(_key_is_virtual=True) | Q(key__isnull=True),\n            _real_key__isnull=True,\n            uid__startswith=stem,\n        )\n    elif stem_len == 20:\n        qs = artifacts.filter(\n            Q(_key_is_virtual=True) | Q(key__isnull=True),\n            _real_key__isnull=True,\n            uid=stem,\n        )\n    else:\n        qs = None\n\n    if qs:  # an empty query set evaluates to False\n        return qs\n\n    qs = (\n        artifacts.filter(Q(_key_is_virtual=False) | Q(_real_key__isnull=False))\n        .alias(\n            db_path=Case(\n                When(\n                    _real_key__isnull=False,\n                    then=Concat(\n                        \"storage__root\",\n                        Value(\"/\"),\n                        \"_real_key\",\n                        output_field=TextField(),\n           
         ),\n                ),\n                default=Concat(\n                    \"storage__root\", Value(\"/\"), \"key\", output_field=TextField()\n                ),\n                output_field=TextField(),\n            )\n        )\n        .filter(db_path=path_str)\n    )\n\n    return qs\n"
  },
  {
    "path": "lamindb/models/block.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Any, Literal, get_args, overload\n\nfrom django.db import models\nfrom django.db.models import (\n    CASCADE,\n    PROTECT,\n    CharField,\n    DateTimeField,\n    ForeignKey,\n    JSONField,\n    Q,\n    TextField,\n)\nfrom lamin_utils import logger\nfrom lamindb_setup.core.hashing import hash_string\n\nfrom ..base.types import RegistryId\nfrom ..base.uids import base62_16\nfrom ._is_versioned import create_uid, process_revises\nfrom .artifact import Artifact\nfrom .collection import Collection\nfrom .feature import Feature\nfrom .project import Project\nfrom .record import Record\nfrom .run import Run, User, current_user_id\nfrom .schema import Schema\nfrom .sqlrecord import (\n    BaseSQLRecord,\n    Branch,\n    IsVersioned,\n    Space,\n    SQLRecord,\n    init_self_from_db,\n    update_attributes,\n)\nfrom .transform import Transform\n\nif TYPE_CHECKING:\n    from datetime import datetime\n\n    from .query_manager import RelatedManager\n\n_VERSIONED_ATTACHED_KINDS = (\"readme\",)  # only readme is versioned; comment is not\n_VALID_BLOCK_KINDS: tuple[str, ...] = (\"readme\", \"comment\")\n_BLOCK_ALLOWED_NON_REGISTRY_KEYS: tuple[str, ...] = (\"README.md\",)\n\n\ndef _init_versioned_attached_block(\n    self: BaseBlock,\n    fk_field_name: str,\n    *args: Any,\n    allowed_extra: tuple[str, ...] 
= (),\n    **kwargs: Any,\n) -> None:\n    cls = type(self)\n    if len(args) == len(self._meta.concrete_fields):\n        super(cls, self).__init__(*args, **kwargs)\n        return None\n    if args:\n        raise ValueError(\n            f\"Please only use keyword arguments to construct a {cls.__name__}\"\n        )\n    fk_value = kwargs.pop(fk_field_name, None)\n    content = kwargs.pop(\"content\", None)\n    kind = kwargs.pop(\"kind\", None)\n    version_tag = kwargs.pop(\"version_tag\", kwargs.pop(\"version\", None))\n    revises = kwargs.pop(\"revises\", None)\n    using = kwargs.pop(\"using\", None)\n    uid = kwargs.pop(\"uid\", None) if \"uid\" in kwargs else None\n    default_allowed_extra = (\"branch\", \"branch_id\", \"created_on\", \"created_on_id\")\n    all_allowed_extra = default_allowed_extra + allowed_extra\n    extra_kwargs = {k: kwargs.pop(k) for k in all_allowed_extra if k in kwargs}\n    allowed = {\n        fk_field_name,\n        \"content\",\n        \"kind\",\n        \"version\",\n        \"version_tag\",\n        \"revises\",\n        \"using\",\n        \"uid\",\n        *all_allowed_extra,\n    }\n    if kwargs:\n        raise ValueError(\n            f\"Only {', '.join(sorted(allowed))} can be passed, but you passed: {kwargs}\"\n        )\n    if fk_value is None:\n        raise ValueError(f\"{fk_field_name} is required for {cls.__name__}\")\n    if kind is None:\n        raise ValueError(\n            f\"kind is required for {cls.__name__}; use 'readme' or 'comment'\"\n        )\n    if kind not in _VALID_BLOCK_KINDS:\n        raise ValueError(f\"kind must be 'readme' or 'comment', got {kind!r}\")\n\n    if kind == \"comment\":\n        if revises is not None:\n            raise ValueError(\n                \"revises is not allowed for kind='comment'; comments are not versioned\"\n            )\n        new_uid, _ = create_uid(\n            revises=None,\n            version_tag=version_tag,\n            
n_full_id=cls._len_full_uid,\n        )\n        block_hash = hash_string(content) if content else None\n        super(cls, self).__init__(\n            uid=new_uid,\n            content=content or \"\",\n            hash=block_hash,\n            kind=kind,\n            version_tag=version_tag,\n            revises=None,\n            **{fk_field_name: fk_value},\n            **extra_kwargs,\n        )\n        return None\n    # kind == \"readme\" (versioned)\n    if revises is None and fk_value is not None:\n        candidate_for_revises = (\n            cls.objects.using(using)\n            .filter(\n                **{fk_field_name: fk_value},\n                kind=kind,\n                is_latest=True,\n            )\n            .order_by(\"-created_at\")\n            .first()\n        )\n        if candidate_for_revises is not None:\n            revises = candidate_for_revises\n            content_blank = getattr(revises, \"content\", None) in (None, \"\")\n            if content_blank:\n                logger.important(\n                    \"no content was yet saved, returning existing \"\n                    f\"block with same {fk_field_name} and kind\"\n                )\n                uid = revises.uid\n    if revises is not None and uid is not None and uid == revises.uid:\n        init_self_from_db(self, revises)\n        update_attributes(self, {})\n        return None\n    new_uid, revises = create_uid(\n        revises=revises,\n        version_tag=version_tag,\n        n_full_id=cls._len_full_uid,\n    )\n    if uid is None:\n        uid = new_uid\n    block_hash = hash_string(content) if content else None\n    super(cls, self).__init__(\n        uid=uid,\n        content=content or \"\",\n        hash=block_hash,\n        kind=kind,\n        version_tag=version_tag,\n        revises=revises,\n        **{fk_field_name: fk_value},\n        **extra_kwargs,\n    )\n\n\nclass BaseBlock(IsVersioned):\n    class Meta:\n        abstract = True\n\n    
_len_full_uid: int = 20\n    _len_stem_uid: int = 16\n\n    id = models.BigAutoField(primary_key=True)\n    \"\"\"Internal id, valid only in one DB instance.\"\"\"\n    uid: str = CharField(\n        editable=False,\n        unique=True,\n        db_index=True,\n        max_length=_len_full_uid,\n        default=base62_16,\n    )\n    \"\"\"Universal id.\"\"\"\n    content: str = TextField()\n    \"\"\"Content of the block.\"\"\"\n    hash: str = CharField(max_length=22, db_index=True, null=True)\n    \"\"\"Content hash of the block.\"\"\"\n    kind: str = CharField(\n        max_length=22, db_index=True, default=\"readme\", db_default=\"readme\"\n    )\n    \"\"\"The kind of block.\"\"\"\n    created_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    \"\"\"Time of creation of record.\"\"\"\n    created_by: User = ForeignKey(\n        \"lamindb.User\", PROTECT, default=current_user_id, related_name=\"+\"\n    )\n    \"\"\"Creator of block.\"\"\"\n    _status_code: int = models.SmallIntegerField(default=0, db_default=0, db_index=True)\n    \"\"\"Status code.\"\"\"\n    _aux: dict[str, Any] | None = JSONField(default=None, db_default=None, null=True)\n    \"\"\"Auxiliary field for dictionary-like metadata.\"\"\"\n\n\nclass Block(BaseBlock, SQLRecord):\n    \"\"\"An experimental markdown block for anything: issues, standalone markdown pages, comments, etc.\n\n    The `Block` model is experimental and may change in the future.\n    \"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n\n    # same key as in transform/artifact/collection\n    key: str | None = CharField(max_length=1024, db_index=True, null=True)\n    \"\"\"The key for which we want to create a block.\"\"\"\n    anchor: Block | None = ForeignKey(\n        \"Block\", PROTECT, related_name=\"children\", null=True\n    )\n    \"\"\"The anchor of this block.\n\n    For a comment, could be the issue on which the comment is attached.\n\n    
For a sub-post, could be the parent post.\n    \"\"\"\n    projects: RelatedManager[Project]\n    \"\"\"Projects that annotate this block.\"\"\"\n    anchors: RelatedManager[Block]\n    \"\"\"This block anchors these blocks.\"\"\"\n\n    @overload\n    def __init__(\n        self,\n        key: str | None = None,\n        content: str | None = None,\n        kind: Literal[\"readme\"] = ...,\n        version: str | None = None,\n        revises: Block | None = None,\n        anchor: Block | None = None,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(\n        self,\n        *args,\n        **kwargs,\n    ):\n        if len(args) == len(self._meta.concrete_fields):\n            super().__init__(*args, **kwargs)\n            return None\n        if args:\n            raise ValueError(\"Please only use keyword arguments to construct a Block\")\n        key = kwargs.pop(\"key\", None)\n        content = kwargs.pop(\"content\", None)\n        revises = kwargs.pop(\"revises\", None)\n        version_tag = kwargs.pop(\"version_tag\", kwargs.pop(\"version\", None))\n        kind = kwargs.pop(\"kind\", None)\n        anchor = kwargs.pop(\"anchor\", None)\n        using = kwargs.pop(\"using\", None)\n        uid = kwargs.pop(\"uid\", None) if \"uid\" in kwargs else None\n        branch = kwargs.pop(\"branch\", None)\n        branch_id = kwargs.pop(\"branch_id\", 1)\n        space = kwargs.pop(\"space\", None)\n        space_id = kwargs.pop(\"space_id\", 1)\n        if kwargs:\n            raise ValueError(\n                \"Only key, content, kind, version, revises, anchor \"\n                f\"can be passed, but you passed: {kwargs}\"\n            )\n        if kind != \"readme\":\n            raise ValueError(\"Only kind = 'readme' is supported for block.\")\n        _registry_ids = get_args(RegistryId)\n        allowed_keys = set(_registry_ids).union(_BLOCK_ALLOWED_NON_REGISTRY_KEYS)\n        if key is 
not None and key not in allowed_keys:\n            raise ValueError(\n                \"key must be one of RegistryId or \"\n                f\"{', '.join(_BLOCK_ALLOWED_NON_REGISTRY_KEYS)}: \"\n                f\"{', '.join(_registry_ids)}\"\n            )\n        if revises is not None and not isinstance(revises, Block):\n            raise TypeError(\"`revises` has to be of type `Block`\")\n        if revises is None:\n            if uid is not None:\n                revises = (\n                    Block.objects.using(using)\n                    .filter(\n                        uid__startswith=uid[:-4],\n                        is_latest=True,\n                    )\n                    .order_by(\"-created_at\")\n                    .first()\n                )\n            elif key is not None:\n                candidate_for_revises = (\n                    Block.objects.using(using)\n                    .filter(\n                        ~Q(branch_id=-1),\n                        key=key,\n                        is_latest=True,\n                    )\n                    .order_by(\"-created_at\")\n                    .first()\n                )\n                if candidate_for_revises is not None:\n                    revises = candidate_for_revises\n                    content_blank = getattr(candidate_for_revises, \"content\", None) in (\n                        None,\n                        \"\",\n                    )\n                    if content_blank:\n                        logger.important(\n                            \"no content was yet saved, returning existing \"\n                            \"block with same key\"\n                        )\n                        uid = revises.uid\n        if revises is not None and uid is not None and uid == revises.uid:\n            if revises.key != key:\n                logger.warning(\"ignoring inconsistent key\")\n            init_self_from_db(self, revises)\n            update_attributes(self, 
{})\n            return None\n        if revises is not None and key is not None and revises.key != key:\n            logger.important(f\"renaming block {revises.key} to {key}\")\n        new_uid, version_tag, key, _, revises = process_revises(\n            revises, version_tag, key, None, Block\n        )\n        if uid is None:\n            uid = new_uid\n        block_hash = None\n        if content is not None:\n            block_hash = hash_string(content)\n            block_candidate = Block.objects.filter(\n                ~Q(branch_id=-1),\n                hash=block_hash,\n                is_latest=True,\n            ).first()\n            if block_candidate is not None:\n                init_self_from_db(self, block_candidate)\n                update_attributes(self, {})\n                if key is not None and block_candidate.key != key:\n                    logger.warning(\n                        f\"key {self.key} on existing block differs from \"\n                        f\"passed key {key}, keeping original key\"\n                    )\n                return None\n        super().__init__(\n            uid=uid,\n            key=key,\n            content=content or \"\",\n            kind=kind,\n            version_tag=version_tag,\n            hash=block_hash,\n            revises=revises,\n            anchor=anchor,\n            branch=branch,\n            branch_id=branch_id,\n            space=space,\n            space_id=space_id,\n        )\n\n\nclass HasBranch(models.Model):\n    class Meta:\n        abstract = True\n\n    branch: Branch = ForeignKey(\n        Branch,\n        PROTECT,\n        default=1,\n        db_default=1,\n        related_name=\"+\",\n    )\n    \"\"\"The current branch of the object - changes e.g. 
on merge events.\"\"\"\n    created_on: Branch = ForeignKey(\n        Branch,\n        PROTECT,\n        default=1,\n        db_default=1,\n        related_name=\"+\",\n    )\n    \"\"\"The branch on which this object was created - never changes.\"\"\"\n\n\nclass RecordBlock(BaseBlock, BaseSQLRecord, HasBranch):\n    \"\"\"An unstructured notes block that can be attached to a record.\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n\n    record: Record = ForeignKey(Record, CASCADE, related_name=\"ablocks\")\n    \"\"\"The record to which the block is attached.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        _init_versioned_attached_block(self, \"record\", *args, **kwargs)\n\n\nclass ArtifactBlock(BaseBlock, BaseSQLRecord, HasBranch):\n    \"\"\"An unstructured notes block that can be attached to an artifact.\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n\n    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name=\"ablocks\")\n    \"\"\"The artifact to which the block is attached.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        _init_versioned_attached_block(self, \"artifact\", *args, **kwargs)\n\n\nclass TransformBlock(BaseBlock, BaseSQLRecord, HasBranch):\n    \"\"\"An unstructured notes block that can be attached to a transform.\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n\n    transform: Transform = ForeignKey(\n        Transform, CASCADE, related_name=\"ablocks\", null=True\n    )\n    \"\"\"The transform to which the block is attached.\"\"\"\n    line_number: int | None = models.IntegerField(null=True)\n    \"\"\"The line number in the source code to which the block belongs.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        _init_versioned_attached_block(\n            self, \"transform\", *args, allowed_extra=(\"line_number\",), **kwargs\n        )\n\n\nclass RunBlock(BaseBlock, BaseSQLRecord, HasBranch):\n    \"\"\"An unstructured notes block that can be attached to a 
run.\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n\n    run: Run = ForeignKey(Run, CASCADE, related_name=\"ablocks\")\n    \"\"\"The run to which the block is attached.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        _init_versioned_attached_block(self, \"run\", *args, **kwargs)\n\n\nclass CollectionBlock(BaseBlock, BaseSQLRecord, HasBranch):\n    \"\"\"An unstructured notes block that can be attached to a collection.\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n\n    collection: Collection = ForeignKey(\n        Collection, CASCADE, related_name=\"ablocks\", null=True\n    )\n    \"\"\"The collection to which the block is attached.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        _init_versioned_attached_block(self, \"collection\", *args, **kwargs)\n\n\nclass SchemaBlock(BaseBlock, BaseSQLRecord, HasBranch):\n    \"\"\"An unstructured notes block that can be attached to a schema.\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n\n    schema: Schema = ForeignKey(Schema, CASCADE, related_name=\"ablocks\")\n    \"\"\"The schema to which the block is attached.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        _init_versioned_attached_block(self, \"schema\", *args, **kwargs)\n\n\nclass FeatureBlock(BaseBlock, BaseSQLRecord, HasBranch):\n    \"\"\"An unstructured notes block that can be attached to a feature.\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n\n    feature: Feature = ForeignKey(Feature, CASCADE, related_name=\"ablocks\")\n    \"\"\"The feature to which the block is attached.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        _init_versioned_attached_block(self, \"feature\", *args, **kwargs)\n\n\nclass ProjectBlock(BaseBlock, BaseSQLRecord, HasBranch):\n    \"\"\"An unstructured notes block that can be attached to a project.\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n\n    project: Project = ForeignKey(Project, CASCADE, related_name=\"ablocks\")\n    
\"\"\"The project to which the block is attached.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        _init_versioned_attached_block(self, \"project\", *args, **kwargs)\n\n\nclass SpaceBlock(BaseBlock, BaseSQLRecord, HasBranch):\n    \"\"\"An unstructured notes block that can be attached to a space.\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n\n    space: Space = ForeignKey(Space, CASCADE, related_name=\"ablocks\")\n    \"\"\"The space to which the block is attached.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        _init_versioned_attached_block(self, \"space\", *args, **kwargs)\n\n\nclass ULabelBlock(BaseBlock, BaseSQLRecord, HasBranch):\n    \"\"\"An unstructured notes block that can be attached to a ulabel.\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n\n    ulabel = ForeignKey(\"ULabel\", CASCADE, related_name=\"ablocks\")\n    \"\"\"The ulabel to which the block is attached.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        _init_versioned_attached_block(self, \"ulabel\", *args, **kwargs)\n\n\nclass BranchBlock(BaseBlock, BaseSQLRecord):\n    \"\"\"An unstructured notes block that can be attached to a branch.\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n\n    branch: Branch = ForeignKey(Branch, CASCADE, related_name=\"ablocks\")\n    \"\"\"The branch to which the block is attached.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        _init_versioned_attached_block(self, \"branch\", *args, **kwargs)\n"
  },
  {
    "path": "lamindb/models/can_curate.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Iterable, Literal, Union\n\nimport numpy as np\nfrom django.core.exceptions import FieldDoesNotExist\nfrom django.db.models import Manager, QuerySet\nfrom lamin_utils import colors, logger\n\nfrom lamindb.base.utils import strict_classmethod\n\nfrom ..errors import ValidationError\nfrom ._from_values import (\n    _format_values,\n    _from_values,\n    get_organism_record_from_field,\n)\nfrom .sqlrecord import SQLRecord, get_name_field\n\nif TYPE_CHECKING:\n    from lamin_utils._inspect import InspectResult\n    from pandas import DataFrame\n\n    from lamindb.base.types import ListLike, StrField\n\n    from .query_set import SQLRecordList\n\n\ndef _check_if_record_in_db(record: str | SQLRecord | None, using_key: str | None):\n    \"\"\"Check if the record is from the using_key DB.\"\"\"\n    if isinstance(record, SQLRecord):\n        if using_key is not None and using_key != \"default\":\n            if record._state.db != using_key:\n                raise ValueError(\n                    f\"record must be a {record.__class__.__get_name_with_module__()} record from instance '{using_key}'!\"\n                )\n\n\ndef _concat_lists(values: ListLike | str) -> list[str]:\n    \"\"\"Concatenate a list of lists of strings into a single list.\"\"\"\n    import pandas as pd\n\n    if isinstance(values, str):\n        values = [values]\n    if isinstance(values, (list, pd.Series)) and len(values) > 0:\n        first_item = values[0] if isinstance(values, list) else values.iloc[0]\n        if isinstance(first_item, list):\n            if isinstance(values, pd.Series):\n                values = values.tolist()\n            values = [\n                v for sublist in values if isinstance(sublist, list) for v in sublist\n            ]\n    return values\n\n\ndef _inspect(\n    cls,\n    values: ListLike,\n    field: StrField | None = None,\n    *,\n    mute: bool = False,\n    organism: 
str | SQLRecord | None = None,\n    source: SQLRecord | None = None,\n    from_source: bool = True,\n    strict_source: bool = False,\n) -> DataFrame | dict[str, list[str]]:\n    \"\"\"{}\"\"\"  # noqa: D415\n    from lamin_utils._inspect import inspect\n\n    values = _concat_lists(values)\n\n    field_str = get_name_field(cls, field=field)\n    queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.filter().all()\n    registry = queryset.model\n    model_name = registry._meta.model.__name__\n    if isinstance(source, SQLRecord):\n        _check_if_record_in_db(source, queryset.db)\n        # if strict_source mode, restrict the query to the passed ontology source\n        # otherwise, inspect across records present in the DB from all ontology sources and no-source\n        if strict_source:\n            queryset = queryset.filter(source=source)\n    organism_record = get_organism_record_from_field(\n        getattr(registry, field_str), organism, values, queryset.db\n    )\n    _check_if_record_in_db(organism_record, queryset.db)\n\n    # do not inspect synonyms if the field is not name field\n    standardize = True\n    if hasattr(registry, \"_name_field\") and field_str != registry._name_field:\n        standardize = False\n\n    # inspect in the DB\n    result_db = inspect(\n        df=_filter_queryset_with_organism(queryset=queryset, organism=organism_record),\n        identifiers=values,\n        field=field_str,\n        standardize=standardize,\n        mute=mute,\n    )\n    nonval = set(result_db.non_validated).difference(result_db.synonyms_mapper.keys())\n\n    if from_source and len(nonval) > 0 and hasattr(registry, \"source_id\"):\n        try:\n            public_result = registry.public(\n                organism=organism_record, source=source\n            ).inspect(\n                values=nonval,\n                field=field_str,\n                mute=True,\n                standardize=standardize,\n            )\n            
public_validated = public_result.validated\n            public_mapper = public_result.synonyms_mapper\n            hint = False\n            if len(public_validated) > 0 and not mute:\n                print_values = _format_values(public_validated)\n                s = \"\" if len(public_validated) == 1 else \"s\"\n                labels = colors.yellow(f\"{len(public_validated)} {model_name} term{s}\")\n                logger.print(\n                    f\"   detected {labels} in public source for\"\n                    f\" {colors.italic(field_str)}: {colors.yellow(print_values)}\"\n                )\n                hint = True\n\n            if len(public_mapper) > 0 and not mute:\n                print_values = _format_values(list(public_mapper.keys()))\n                s = \"\" if len(public_mapper) == 1 else \"s\"\n                labels = colors.yellow(f\"{len(public_mapper)} {model_name} term{s}\")\n                logger.print(\n                    f\"   detected {labels} in public source as {colors.italic(f'synonym{s}')}:\"\n                    f\" {colors.yellow(print_values)}\"\n                )\n                hint = True\n\n            if hint:\n                logger.print(\n                    f\"→  add records from public source to your {model_name} registry via\"\n                    f\" {colors.italic('.from_values()')}\"\n                )\n\n            nonval = [i for i in public_result.non_validated if i not in public_mapper]  # type: ignore\n        # no public source is found\n        except ValueError:\n            logger.warning(\"no public source found, skipping source validation\")\n\n    if len(nonval) > 0 and not mute:\n        print_values = _format_values(list(nonval))\n        s = \"\" if len(nonval) == 1 else \"s\"\n        labels = colors.red(f\"{len(nonval)} term{s}\")\n        logger.print(f\"   couldn't validate {labels}: {colors.red(print_values)}\")\n        logger.print(\n            f\"→  if you are sure, create new 
record{s} via\"\n            f\" {colors.italic(f'{registry.__name__}()')} and save to your registry\"\n        )\n\n    return result_db\n\n\ndef _validate(\n    cls,\n    values: ListLike,\n    field: StrField | None = None,\n    *,\n    mute: bool = False,\n    organism: str | SQLRecord | None = None,\n    source: SQLRecord | None = None,\n    strict_source: bool = False,\n) -> np.ndarray:\n    \"\"\"{}\"\"\"  # noqa: D415\n    import pandas as pd\n    from lamin_utils._inspect import validate\n\n    return_str = True if isinstance(values, str) else False\n    values = _concat_lists(values)\n\n    field_str = get_name_field(cls, field=field)\n\n    queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.filter().all()\n    registry = queryset.model\n    if isinstance(source, SQLRecord):\n        _check_if_record_in_db(source, queryset.db)\n        if strict_source:\n            queryset = queryset.filter(source=source)\n\n    organism_record = get_organism_record_from_field(\n        getattr(registry, field_str), organism, values, queryset.db\n    )\n    _check_if_record_in_db(organism_record, queryset.db)\n    field_values = pd.Series(\n        _filter_queryset_with_organism(\n            queryset=queryset,\n            organism=organism_record,\n            values_list_field=field_str,\n        ),\n        dtype=\"object\",\n    )\n    if field_values.empty:\n        if not mute:\n            msg = f\"Your {queryset.model.__name__} registry is empty, consider populating it first!\"\n            if hasattr(queryset.model, \"source_id\"):\n                msg += \"\\n   → use `.import_source()` to import records from a source, e.g. 
a public ontology\"\n            logger.warning(msg)\n        return np.array([False] * len(values))\n\n    result = validate(\n        identifiers=values,\n        field_values=field_values,\n        case_sensitive=True,\n        mute=mute,\n        field=field_str,\n    )\n    if return_str and len(result) == 1:\n        return result[0]\n    else:\n        return result\n\n\ndef _standardize(\n    cls,\n    values: ListLike,\n    field: StrField | None = None,\n    *,\n    return_field: str = None,\n    return_mapper: bool = False,\n    case_sensitive: bool = False,\n    mute: bool = False,\n    from_source: bool = True,\n    keep: Literal[\"first\", \"last\", False] = \"first\",\n    synonyms_field: str = \"synonyms\",\n    organism: str | SQLRecord | None = None,\n    source: SQLRecord | None = None,\n    strict_source: bool = False,\n) -> list[str] | dict[str, str]:\n    \"\"\"{}\"\"\"  # noqa: D415\n    import pandas as pd\n    from lamin_utils._standardize import standardize as map_synonyms\n\n    return_str = True if isinstance(values, str) else False\n    values = _concat_lists(values)\n\n    field_str = get_name_field(cls, field=field)\n    return_field_str = get_name_field(\n        cls, field=field if return_field is None else return_field\n    )\n    queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.filter().all()\n    registry = queryset.model\n    if isinstance(source, SQLRecord):\n        _check_if_record_in_db(source, queryset.db)\n        if strict_source:\n            queryset = queryset.filter(source=source)\n    organism_record = get_organism_record_from_field(\n        getattr(registry, field_str), organism, values, queryset.db\n    )\n    _check_if_record_in_db(organism_record, queryset.db)\n\n    # only perform synonym mapping if field is the name field\n    if hasattr(registry, \"_name_field\") and field_str != registry._name_field:\n        synonyms_field = None\n\n    try:\n        
registry._meta.get_field(synonyms_field)\n        fields = {\n            field_name\n            for field_name in [field_str, return_field_str, synonyms_field]\n            if field_name is not None\n        }\n        df = _filter_queryset_with_organism(\n            queryset=queryset,\n            organism=organism_record,\n            values_list_fields=list(fields),\n        )\n    except FieldDoesNotExist:\n        df = pd.DataFrame()\n\n    _kwargs = {\n        \"field\": field_str,\n        \"return_field\": return_field_str,\n        \"case_sensitive\": case_sensitive,\n        \"keep\": keep,\n        \"synonyms_field\": synonyms_field,\n    }\n    # standardized names from the DB\n    std_names_db = map_synonyms(\n        df=df,\n        identifiers=values,\n        return_mapper=return_mapper,\n        mute=mute,\n        **_kwargs,\n    )\n\n    def _return(result: list, mapper: dict):\n        if return_mapper:\n            return mapper\n        else:\n            if return_str and len(result) == 1:\n                return result[0]\n            return result\n\n    # map synonyms in public source\n    if hasattr(registry, \"source_id\") and from_source:\n        mapper = {}\n        if return_mapper:\n            mapper = std_names_db\n            std_names_db = map_synonyms(\n                df=df, identifiers=values, return_mapper=False, mute=True, **_kwargs\n            )\n\n        val_res = registry.validate(\n            std_names_db, field=field, mute=True, organism=organism_record\n        )\n        if all(val_res):\n            return _return(result=std_names_db, mapper=mapper)\n\n        nonval = np.array(std_names_db)[~val_res]\n        std_names_bt_mapper = registry.public(\n            organism=organism_record, source=source\n        ).standardize(nonval, return_mapper=True, mute=True, **_kwargs)\n\n        if len(std_names_bt_mapper) > 0 and not mute:\n            s = \"\" if len(std_names_bt_mapper) == 1 else \"s\"\n            
field_print = \"synonym\" if field_str == return_field_str else field_str\n\n            reduced_mapped_keys_str = f\"{list(std_names_bt_mapper.keys())[:10] + ['...'] if len(std_names_bt_mapper) > 10 else list(std_names_bt_mapper.keys())}\"\n            truncated_note = (\n                \" (output truncated)\" if len(std_names_bt_mapper) > 10 else \"\"\n            )\n\n            warn_msg = (\n                f\"found {len(std_names_bt_mapper)} {field_print}{s} in public source{truncated_note}:\"\n                f\" {reduced_mapped_keys_str}\\n\"\n                f\"  please add corresponding {registry._meta.model.__name__} records via{truncated_note}:\"\n                f\" `.from_values({reduced_mapped_keys_str})`\"\n            )\n\n            logger.warning(warn_msg)\n\n        mapper.update(std_names_bt_mapper)\n        if hasattr(std_names_db, \"dtype\") and isinstance(\n            std_names_db.dtype, pd.CategoricalDtype\n        ):\n            result = std_names_db.cat.rename_categories(std_names_bt_mapper).tolist()\n        else:\n            result = pd.Series(std_names_db).replace(std_names_bt_mapper).tolist()\n        return _return(result=result, mapper=mapper)\n\n    else:\n        return _return(result=std_names_db, mapper=std_names_db)\n\n\ndef _add_or_remove_synonyms(\n    synonym: str | ListLike,\n    record: CanCurate,\n    action: Literal[\"add\", \"remove\"],\n    force: bool = False,\n    save: bool | None = None,\n):\n    \"\"\"Add or remove synonyms.\"\"\"\n\n    def check_synonyms_in_all_records(synonyms: set[str], record: CanCurate):\n        \"\"\"Errors if input synonym is associated with other records in the DB.\"\"\"\n        import pandas as pd\n        from IPython.display import display\n\n        syns_all = (\n            record.__class__.filter().exclude(synonyms=\"\").exclude(synonyms=None)  # type: ignore\n        )\n        if len(syns_all) == 0:\n            return\n        df = pd.DataFrame(syns_all.values())\n        
df[\"synonyms\"] = df[\"synonyms\"].str.split(\"|\")\n        df = df.explode(\"synonyms\")\n        matches_df = df[(df[\"synonyms\"].isin(synonyms)) & (df[\"id\"] != record.id)]  # type: ignore\n        if matches_df.shape[0] > 0:\n            records_df = pd.DataFrame(syns_all.filter(id__in=matches_df[\"id\"]).values())\n            logger.error(\n                f\"input synonyms {matches_df['synonyms'].unique()} already associated\"\n                \" with the following records:\\n\"\n            )\n            display(records_df)\n            raise ValidationError(\n                f\"you are trying to assign a synonym to record: {record}\\n\"\n                \"    → consider removing the synonym from existing records or using a different synonym.\"\n            )\n\n    # passed synonyms\n    # nothing happens when passing an empty string or list\n    if isinstance(synonym, str):\n        if len(synonym) == 0:\n            return\n        syn_new_set = {synonym}\n    else:\n        if synonym == [\"\"]:\n            return\n        syn_new_set = set(synonym)\n    # nothing happens when passing an empty string or list\n    if len(syn_new_set) == 0:\n        return\n    # because we use | as the separator\n    if any(\"|\" in i for i in syn_new_set):\n        raise ValidationError(\"a synonym can't contain '|'!\")\n\n    # existing synonyms\n    syns_exist = record.synonyms  # type: ignore\n    if syns_exist is None or len(syns_exist) == 0:\n        syns_exist_set = set()\n    else:\n        syns_exist_set = set(syns_exist.split(\"|\"))\n\n    if action == \"add\":\n        if not force:\n            check_synonyms_in_all_records(syn_new_set, record)\n        syns_exist_set.update(syn_new_set)\n    elif action == \"remove\":\n        syns_exist_set = syns_exist_set.difference(syn_new_set)\n\n    if len(syns_exist_set) == 0:\n        syns_str = None\n    else:\n        syns_str = \"|\".join(syns_exist_set)\n\n    record.synonyms = syns_str  # type: ignore\n\n 
   if save is None:\n        # if record is already in DB, save the changes to DB\n        save = not record._state.adding  # type: ignore\n    if save:\n        record.save()  # type: ignore\n\n\ndef _check_synonyms_field_exist(record: CanCurate):\n    \"\"\"Check if synonyms field exists.\"\"\"\n    if not hasattr(record, \"synonyms\"):\n        raise NotImplementedError(\n            f\"No synonyms field found in table {record.__class__.__name__}!\"\n        ) from None\n\n\ndef _filter_queryset_with_organism(\n    queryset: QuerySet,\n    organism: SQLRecord | None = None,\n    values_list_field: str | None = None,\n    values_list_fields: list[str] | None = None,\n):\n    \"\"\"Filter a queryset based on organism.\"\"\"\n    import pandas as pd\n\n    if organism is not None:\n        queryset = queryset.filter(organism=organism)\n\n    # values_list_field/s for better performance\n    if values_list_field is None:\n        if values_list_fields:\n            return pd.DataFrame.from_records(\n                queryset.values_list(*values_list_fields), columns=values_list_fields\n            )\n        return pd.DataFrame.from_records(queryset.values())\n    else:\n        return queryset.values_list(values_list_field, flat=True)\n\n\nclass CanCurate:\n    \"\"\"Base class providing :class:`~lamindb.models.SQLRecord`-based validation.\"\"\"\n\n    @strict_classmethod\n    def inspect(\n        cls,\n        values: ListLike,\n        field: StrField | None = None,\n        *,\n        mute: bool = False,\n        organism: Union[str, SQLRecord, None] = None,\n        source: SQLRecord | None = None,\n        from_source: bool = True,\n        strict_source: bool = False,\n    ) -> InspectResult:\n        \"\"\"Inspect if values are mappable to a field.\n\n        Being mappable means that an exact match exists.\n\n        Args:\n            values: Values that will be checked against the field.\n            field: The field of values. 
Examples are `'ontology_id'` to map\n                against the source ID or `'name'` to map against the ontologies\n                field names.\n            mute: Whether to mute logging.\n            organism: An Organism name or record.\n            source: A `bionty.Source` record that specifies the version to inspect against.\n            strict_source: Determines the validation behavior against records in the registry.\n                - If `False`, validation will include all records in the registry, ignoring the specified source.\n                - If `True`, validation will only include records in the registry  that are linked to the specified source.\n                Note: this parameter won't affect validation against public sources.\n\n        See Also:\n            :meth:`~lamindb.models.CanCurate.validate`\n\n        Example::\n\n            import bionty as bt\n\n            # save some gene records\n            bt.Gene.from_values([\"A1CF\", \"A1BG\", \"BRCA2\"], field=\"symbol\", organism=\"human\").save()\n\n            # inspect gene symbols\n            gene_symbols = [\"A1CF\", \"A1BG\", \"FANCD1\", \"FANCD20\"]\n            result = bt.Gene.inspect(gene_symbols, field=bt.Gene.symbol, organism=\"human\")\n            assert result.validated == [\"A1CF\", \"A1BG\"]\n            assert result.non_validated == [\"FANCD1\", \"FANCD20\"]\n        \"\"\"\n        return _inspect(\n            cls=cls,\n            values=values,\n            field=field,\n            mute=mute,\n            strict_source=strict_source,\n            organism=organism,\n            source=source,\n            from_source=from_source,\n        )\n\n    @strict_classmethod\n    def validate(\n        cls,\n        values: ListLike,\n        field: StrField | None = None,\n        *,\n        mute: bool = False,\n        organism: Union[str, SQLRecord, None] = None,\n        source: SQLRecord | None = None,\n        strict_source: bool = False,\n    ) -> np.ndarray:\n   
     \"\"\"Validate values against existing values of a string field.\n\n        Note this is strict_source validation, only asserts exact matches.\n\n        Args:\n            values: Values that will be validated against the field.\n            field: The field of values.\n                    Examples are `'ontology_id'` to map against the source ID\n                    or `'name'` to map against the ontologies field names.\n            mute: Whether to mute logging.\n            organism: An Organism name or record.\n            source: A `bionty.Source` record that specifies the version to validate against.\n            strict_source: Determines the validation behavior against records in the registry.\n                - If `False`, validation will include all records in the registry, ignoring the specified source.\n                - If `True`, validation will only include records in the registry  that are linked to the specified source.\n                Note: this parameter won't affect validation against public sources.\n\n        Returns:\n            A vector of booleans indicating if an element is validated.\n\n        See Also:\n            :meth:`~lamindb.models.CanCurate.inspect`\n\n        Example::\n\n            import bionty as bt\n\n            bt.Gene.from_values([\"A1CF\", \"A1BG\", \"BRCA2\"], field=\"symbol\", organism=\"human\").save()\n\n            gene_symbols = [\"A1CF\", \"A1BG\", \"FANCD1\", \"FANCD20\"]\n            bt.Gene.validate(gene_symbols, field=bt.Gene.symbol, organism=\"human\")\n            #> array([ True,  True, False, False])\n        \"\"\"\n        return _validate(\n            cls=cls,\n            values=values,\n            field=field,\n            mute=mute,\n            strict_source=strict_source,\n            organism=organism,\n            source=source,\n        )\n\n    @strict_classmethod\n    def from_values(\n        cls,\n        values: ListLike,\n        field: StrField | None = None,\n        create: 
bool = False,\n        organism: Union[SQLRecord, str, None] = None,\n        source: SQLRecord | None = None,\n        standardize: bool = True,\n        from_source: bool = True,\n        mute: bool = False,\n    ) -> SQLRecordList:\n        \"\"\"Bulk create validated records by parsing values for an identifier (such as a name or an id).\n\n        Args:\n            values: A list of values for an identifier, e.g. `[\"name1\", \"name2\"]`.\n            field: A `SQLRecord` field to look up, e.g., `bt.CellMarker.name`.\n            create: Whether to create records if they don't exist.\n            organism: A `bionty.Organism` name or record.\n            source: A `bionty.Source` record to validate against to create records for.\n            standardize: Whether to standardize synonyms in the values.\n            from_source: Whether to create records from public source.\n            mute: Whether to mute logging.\n\n        Returns:\n            A list of validated records. For bionty registries. 
Also returns knowledge-coupled records.\n\n        Notes:\n            For more info, see tutorial: :doc:`docs:manage-ontologies`.\n\n        Example::\n\n            import bionty as bt\n\n            # Bulk create from non-validated values will log warnings & returns empty list\n            ulabels = ln.ULabel.from_values([\"benchmark\", \"prediction\", \"test\"])\n            assert len(ulabels) == 0\n\n            # Bulk create records from validated values returns the corresponding existing records\n            ulabels = ln.ULabel.from_values([\"benchmark\", \"prediction\", \"test\"], create=True).save()\n            assert len(ulabels) == 3\n\n            # Bulk create records from public reference\n            bt.CellType.from_values([\"T cell\", \"B cell\"]).save()\n        \"\"\"\n        return _from_values(\n            iterable=values,\n            field=getattr(cls, get_name_field(cls, field=field)),\n            create=create,\n            organism=organism,\n            source=source,\n            mute=mute,\n        )\n\n    @strict_classmethod\n    def standardize(\n        cls,\n        values: Iterable,\n        field: StrField | None = None,\n        *,\n        return_field: StrField | None = None,\n        return_mapper: bool = False,\n        case_sensitive: bool = False,\n        mute: bool = False,\n        from_source: bool = True,\n        keep: Literal[\"first\", \"last\", False] = \"first\",\n        synonyms_field: str = \"synonyms\",\n        organism: Union[str, SQLRecord, None] = None,\n        source: SQLRecord | None = None,\n        strict_source: bool = False,\n    ) -> list[str] | dict[str, str]:\n        \"\"\"Maps input synonyms to standardized names.\n\n        Args:\n            values: Identifiers that will be standardized.\n            field: The field representing the standardized names.\n            return_field: The field to return. 
Defaults to field.\n            return_mapper: If `True`, returns `{input_value: standardized_name}`.\n            case_sensitive: Whether the mapping is case sensitive.\n            mute: Whether to mute logging.\n            from_source: Whether to standardize from public source. Defaults to `True` for BioRecord registries.\n            keep: When a synonym maps to multiple names, determines which duplicates to mark as `pd.DataFrame.duplicated`:\n                - `\"first\"`: returns the first mapped standardized name\n                - `\"last\"`: returns the last mapped standardized name\n                - `False`: returns all mapped standardized names.\n\n                When `keep` is `False`, the returned list of standardized names will contain nested lists in case of duplicates.\n\n                When a field is converted into return_field, keep marks which matches to keep when multiple return_field values map to the same field value.\n            synonyms_field: A field containing the concatenated synonyms.\n            organism: An Organism name or record.\n            source: A `bionty.Source` record that specifies the version to validate against.\n            strict_source: Determines the validation behavior against records in the registry.\n                - If `False`, validation will include all records in the registry, ignoring the specified source.\n                - If `True`, validation will only include records in the registry that are linked to the specified source.\n                Note: this parameter won't affect validation against public sources.\n\n        Returns:\n            If `return_mapper` is `False`: a list of standardized names. 
Otherwise,\n            a dictionary of mapped values with mappable synonyms as keys and\n            standardized names as values.\n\n        See Also:\n            :meth:`~lamindb.models.CanCurate.add_synonym`\n                Add synonyms.\n            :meth:`~lamindb.models.CanCurate.remove_synonym`\n                Remove synonyms.\n\n        Example::\n\n            import bionty as bt\n\n            # save some gene records\n            bt.Gene.from_values([\"A1CF\", \"A1BG\", \"BRCA2\"], field=\"symbol\", organism=\"human\").save()\n\n            # standardize gene synonyms\n            gene_synonyms = [\"A1CF\", \"A1BG\", \"FANCD1\", \"FANCD20\"]\n            bt.Gene.standardize(gene_synonyms)\n            #> ['A1CF', 'A1BG', 'BRCA2', 'FANCD20']\n        \"\"\"\n        return _standardize(\n            cls=cls,\n            values=values,\n            field=field,\n            return_field=return_field,\n            return_mapper=return_mapper,\n            case_sensitive=case_sensitive,\n            mute=mute,\n            strict_source=strict_source,\n            from_source=from_source,\n            keep=keep,\n            synonyms_field=synonyms_field,\n            organism=organism,\n            source=source,\n        )\n\n    def add_synonym(\n        self,\n        synonym: str | ListLike,\n        force: bool = False,\n        save: bool | None = None,\n    ):\n        \"\"\"Add synonyms to a record.\n\n        Args:\n            synonym: The synonyms to add to the record.\n            force: Whether to add synonyms even if they are already synonyms of other records.\n            save: Whether to save the record to the database.\n\n        See Also:\n            :meth:`~lamindb.models.CanCurate.remove_synonym`\n                Remove synonyms.\n\n        Example::\n\n            import bionty as bt\n\n            # save \"T cell\" record\n            record = bt.CellType.from_source(name=\"T cell\").save()\n            record.synonyms\n           
 #> \"T-cell|T lymphocyte|T-lymphocyte\"\n\n            # add a synonym\n            record.add_synonym(\"T cells\")\n            record.synonyms\n            #> \"T cells|T-cell|T-lymphocyte|T lymphocyte\"\n        \"\"\"\n        _check_synonyms_field_exist(self)\n        _add_or_remove_synonyms(\n            synonym=synonym, record=self, force=force, action=\"add\", save=save\n        )\n\n    def remove_synonym(self, synonym: str | ListLike):\n        \"\"\"Remove synonyms from a record.\n\n        Args:\n            synonym: The synonym values to remove.\n\n        See Also:\n            :meth:`~lamindb.models.CanCurate.add_synonym`\n                Add synonyms\n\n        Example::\n\n            import bionty as bt\n\n            # save \"T cell\" record\n            record = bt.CellType.from_source(name=\"T cell\").save()\n            record.synonyms\n            #> \"T-cell|T lymphocyte|T-lymphocyte\"\n\n            # remove a synonym\n            record.remove_synonym(\"T-cell\")\n            record.synonyms\n            #> \"T lymphocyte|T-lymphocyte\"\n        \"\"\"\n        _check_synonyms_field_exist(self)\n        _add_or_remove_synonyms(synonym=synonym, record=self, action=\"remove\")\n\n    def set_abbr(self, value: str):\n        \"\"\"Set value for abbr field and add to synonyms.\n\n        Args:\n            value: A value for an abbreviation.\n\n        See Also:\n            :meth:`~lamindb.models.CanCurate.add_synonym`\n\n        Example::\n\n            import bionty as bt\n\n            # save an experimental factor record\n            scrna = bt.ExperimentalFactor.from_source(name=\"single-cell RNA sequencing\").save()\n            assert scrna.abbr is None\n            assert scrna.synonyms == \"single-cell RNA-seq|single-cell transcriptome sequencing|scRNA-seq|single cell RNA sequencing\"\n\n            # set abbreviation\n            scrna.set_abbr(\"scRNA\")\n            assert scrna.abbr == \"scRNA\"\n            # synonyms are 
updated\n            assert scrna.synonyms == \"scRNA|single-cell RNA-seq|single cell RNA sequencing|single-cell transcriptome sequencing|scRNA-seq\"\n        \"\"\"\n        self.abbr = value\n\n        if hasattr(self, \"name\") and value == self.name:\n            pass\n        else:\n            try:\n                self.add_synonym(value, save=False)\n            except Exception as e:  # pragma: no cover\n                logger.debug(\n                    f\"Encountered an Exception while attempting to add synonyms.\\n{e}\"\n                )\n\n        if not self._state.adding:  # type: ignore\n            self.save()  # type: ignore\n"
  },
  {
    "path": "lamindb/models/collection.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Any, Literal, overload\n\nfrom django.db import models\nfrom django.db.models import CASCADE, PROTECT, Q\nfrom lamin_utils import logger\nfrom lamindb_setup.core.hashing import HASH_LENGTH, hash_set\n\nfrom lamindb.base.fields import (\n    CharField,\n    ForeignKey,\n    OneToOneField,\n    TextField,\n)\nfrom lamindb.base.utils import strict_classmethod\n\nfrom ..base.uids import base62_20\nfrom ..errors import FieldValidationError\nfrom ..models._is_versioned import process_revises\nfrom ._is_versioned import IsVersioned\nfrom .artifact import (\n    Artifact,\n    get_run,\n    populate_subsequent_run,\n    save_schema_links,\n    track_run_input,\n)\nfrom .has_parents import view_lineage\nfrom .run import Run, TracksRun, TracksUpdates\nfrom .sqlrecord import (\n    BaseSQLRecord,\n    IsLink,\n    SQLRecord,\n    _get_record_kwargs,\n    init_self_from_db,\n    update_attributes,\n)\n\nif TYPE_CHECKING:\n    from collections.abc import Iterable, Iterator\n\n    import anndata as ad\n    import pandas as pd\n    from polars import LazyFrame as PolarsLazyFrame\n    from pyarrow.dataset import Dataset as PyArrowDataset\n\n    from ..core._mapped_collection import MappedCollection\n    from ..core.storage import UPath\n    from .block import CollectionBlock\n    from .project import Project, Reference\n    from .query_manager import RelatedManager\n    from .query_set import QuerySet\n    from .record import Record\n    from .transform import Transform\n    from .ulabel import ULabel\n\n\ndef _load_concat_artifacts(\n    artifacts: list[Artifact], join: Literal[\"inner\", \"outer\"] = \"outer\", **kwargs\n) -> pd.DataFrame | ad.AnnData:\n    import anndata as ad\n    import pandas as pd\n\n    suffixes = {artifact.suffix for artifact in artifacts}\n    if len(suffixes) != 1:\n        raise ValueError(\n            \"Can only load collections where all artifacts have the same 
suffix\"\n        )\n\n    # because we're tracking data flow on the collection-level, here, we don't\n    # want to track it on the artifact-level\n    first_object = artifacts[0].load(is_run_input=False)\n    is_dataframe = isinstance(first_object, pd.DataFrame)\n    is_anndata = isinstance(first_object, ad.AnnData)\n    if not is_dataframe and not is_anndata:\n        raise ValueError(f\"Unable to concatenate {suffixes.pop()} objects.\")\n\n    objects = [first_object]\n    artifact_uids = [artifacts[0].uid]\n    for artifact in artifacts[1:]:\n        objects.append(artifact.load(is_run_input=False))\n        artifact_uids.append(artifact.uid)\n\n    if is_dataframe:\n        concat_object = pd.concat(objects, join=join, **kwargs)\n    elif is_anndata:\n        label = kwargs.pop(\"label\", \"artifact_uid\")\n        keys = kwargs.pop(\"keys\", artifact_uids)\n        concat_object = ad.concat(objects, join=join, label=label, keys=keys, **kwargs)\n    return concat_object\n\n\nclass Collection(SQLRecord, IsVersioned, TracksRun, TracksUpdates):\n    \"\"\"Versioned collections of artifacts.\n\n    Args:\n        artifacts: `Artifact | list[Artifact]` One or several artifacts.\n        key: `str` A file-path like key, analogous to the `key` parameter of `Artifact` and `Transform`.\n        description: `str | None = None` A description.\n        meta: `Artifact | None = None` An artifact that defines metadata for the collection.\n        reference: `str | None = None` A simple reference, e.g. 
an external ID or a URL.\n        reference_type: `str | None = None` A way to indicate the type of the simple reference, e.g., `\"url\"`.\n        run: `Run | None = None` The run that creates the collection.\n        revises: `Collection | None = None` An old version of the collection.\n        skip_hash_lookup: `bool = False` Skip the hash lookup so that a new collection is created even if a collection with the same hash already exists.\n\n\n    See Also:\n        :class:`~lamindb.Artifact`\n\n    Examples:\n\n        Create a collection from a list of :class:`~lamindb.Artifact` objects::\n\n            collection = ln.Collection([artifact1, artifact2], key=\"my_project/my_collection\")\n\n        Create a collection that groups a data & a metadata artifact (e.g., here :doc:`docs:rxrx`)::\n\n            collection = ln.Collection(data_artifact, key=\"my_project/my_collection\", meta=metadata_artifact)\n\n    \"\"\"\n\n    class Meta(SQLRecord.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):\n        abstract = False\n        app_label = \"lamindb\"\n        constraints = [\n            models.UniqueConstraint(\n                fields=[\"key\", \"hash\"],\n                name=\"unique_collection_key_hash_not_null\",\n            )\n        ]\n\n    _len_full_uid: int = 20\n    _len_stem_uid: int = 16\n    _name_field: str = \"key\"\n\n    id: int = models.AutoField(primary_key=True)\n    \"\"\"Internal id, valid only in one DB instance.\"\"\"\n    uid: str = CharField(\n        editable=False,\n        unique=True,\n        db_index=True,\n        max_length=_len_full_uid,\n        default=base62_20,\n    )\n    \"\"\"Universal id, valid across DB instances.\"\"\"\n    key: str = CharField(db_index=True)\n    \"\"\"Name or path-like key.\"\"\"\n    # below is the only case in which we use a TextField\n    # for description; we do so because users had descriptions exceeding 255 chars\n    # in their instances\n    description: str | None = 
TextField(null=True)\n    \"\"\"A description or title.\"\"\"\n    hash: str | None = CharField(\n        max_length=HASH_LENGTH,\n        db_index=True,\n        null=True,\n    )\n    \"\"\"Hash of collection content.\"\"\"\n    reference: str | None = CharField(max_length=255, db_index=True, null=True)\n    \"\"\"A reference like URL or external ID.\"\"\"\n    # also for reference_type here, we allow an extra long max_length\n    reference_type: str | None = CharField(max_length=25, db_index=True, null=True)\n    \"\"\"Type of reference, e.g., cellxgene Census collection_id.\"\"\"\n    ulabels: RelatedManager[ULabel] = models.ManyToManyField(\n        \"ULabel\", through=\"CollectionULabel\", related_name=\"collections\"\n    )\n    \"\"\"ULabels annotating the collection (see :class:`~lamindb.Feature`) ← :attr:`~lamindb.ULabel.collections`.\"\"\"\n    run: Run | None = ForeignKey(\n        Run, PROTECT, related_name=\"output_collections\", null=True, default=None\n    )\n    \"\"\":class:`~lamindb.Run` that created the `collection` ← :attr:`~lamindb.Run.output_collections`.\"\"\"\n    input_of_runs: RelatedManager[Run] = models.ManyToManyField(\n        Run, related_name=\"input_collections\"\n    )\n    \"\"\"Runs that use this collection as an input ← :attr:`~lamindb.Run.input_collections`.\"\"\"\n    recreating_runs: RelatedManager[Run] = models.ManyToManyField(\n        \"Run\",\n        related_name=\"recreated_collections\",\n    )\n    \"\"\"Runs that re-created the record after initial creation ← :attr:`~lamindb.Run.recreated_collections`.\"\"\"\n    artifacts: RelatedManager[Artifact] = models.ManyToManyField(\n        \"Artifact\", related_name=\"collections\", through=\"CollectionArtifact\"\n    )\n    \"\"\"Artifacts in collection ← :attr:`~lamindb.Artifact.collections`.\"\"\"\n    meta_artifact: Artifact | None = OneToOneField(\n        \"Artifact\",\n        PROTECT,\n        null=True,\n        unique=True,\n        
related_name=\"_meta_of_collection\",\n    )\n    \"\"\"An artifact that stores metadata that indexes a collection.\n\n    It has a 1:1 correspondence with an artifact. If needed, you can access the\n    collection from the artifact via a private field:\n    `artifact._meta_of_collection`.\n    \"\"\"\n    linked_in_records: RelatedManager[Record] = models.ManyToManyField(\n        \"Record\", through=\"RecordCollection\", related_name=\"linked_collections\"\n    )\n    \"\"\"This collection is linked in these records as a value ← :attr:`~lamindb.Record.linked_collections`.\"\"\"\n    _actions: RelatedManager[Artifact] = models.ManyToManyField(\n        Artifact, related_name=\"+\"\n    )\n    \"\"\"Actions to attach for the UI.\"\"\"\n    projects: RelatedManager[Project]\n    \"\"\"Linked projects ← :attr:`~lamindb.Project.collections`.\"\"\"\n    references: RelatedManager[Reference]\n    \"\"\"Linked references ← :attr:`~lamindb.Reference.collections`.\"\"\"\n    records: RelatedManager[Record]\n    \"\"\"Linked records ← :attr:`~lamindb.Record.collections`.\"\"\"\n    ablocks: RelatedManager[CollectionBlock]\n    \"\"\"Attached blocks ← :attr:`~lamindb.CollectionBlock.collection`.\"\"\"\n\n    @overload\n    def __init__(\n        self,\n        artifacts: Artifact | list[Artifact],\n        key: str,\n        description: str | None = None,\n        meta: Any | None = None,\n        reference: str | None = None,\n        reference_type: str | None = None,\n        run: Run | None = None,\n        revises: Collection | None = None,\n        skip_hash_lookup: bool = False,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(\n        self,\n        *args,\n        **kwargs,\n    ):\n        if len(args) == len(self._meta.concrete_fields):\n            super().__init__(*args, **kwargs)\n            return None\n        # now we proceed with the user-facing constructor\n        if len(args) > 1:\n      
      raise ValueError(\"Only one non-keyword arg allowed: artifacts\")\n        artifacts: Artifact | list[Artifact] = (\n            kwargs.pop(\"artifacts\") if len(args) == 0 else args[0]\n        )\n        meta_artifact: Artifact | None = kwargs.pop(\"meta_artifact\", None)\n        key: str | None = kwargs.pop(\"key\", None)\n        description: str | None = kwargs.pop(\"description\", None)\n        reference: str | None = kwargs.pop(\"reference\", None)\n        reference_type: str | None = kwargs.pop(\"reference_type\", None)\n        run: Run | None = kwargs.pop(\"run\", None)\n        revises: Collection | None = kwargs.pop(\"revises\", None)\n        version_tag: str | None = kwargs.pop(\"version_tag\", kwargs.pop(\"version\", None))\n        skip_hash_lookup: bool = kwargs.pop(\"skip_hash_lookup\", False)\n        branch = kwargs.pop(\"branch\", None)\n        branch_id = kwargs.pop(\"branch_id\", 1)\n        space = kwargs.pop(\"space\", None)\n        space_id = kwargs.pop(\"space_id\", 1)\n        if not len(kwargs) == 0:\n            valid_keywords = \", \".join(\n                [val[0] for val in _get_record_kwargs(Collection)]\n            )\n            raise FieldValidationError(\n                f\"Only {valid_keywords} can be passed, you passed: {kwargs}\"\n            )\n        if revises is None:\n            revises = (\n                Collection.filter(key=key, is_latest=True)\n                .order_by(\"-created_at\")\n                .first()\n            )\n        provisional_uid, version_tag, key, description, revises = process_revises(\n            revises, version_tag, key, description, Collection\n        )\n        run = get_run(run)\n        if isinstance(artifacts, Artifact):\n            artifacts = [artifacts]\n        else:\n            if not hasattr(artifacts, \"__getitem__\"):\n                raise ValueError(\"Artifact or list[Artifact] is allowed.\")\n            assert isinstance(artifacts[0], Artifact)  # type: 
ignore  # noqa: S101\n        hash = from_artifacts(artifacts)  # type: ignore\n        if meta_artifact is not None:\n            if not isinstance(meta_artifact, Artifact):\n                raise ValueError(\"meta_artifact has to be an Artifact\")\n            if isinstance(meta_artifact, Artifact):\n                if meta_artifact._state.adding:\n                    raise ValueError(\n                        \"Save meta_artifact artifact before creating collection!\"\n                    )\n        # we ignore collections in trash containing the same hash\n        if hash is not None and not skip_hash_lookup:\n            # this purposefully leaves out the key that we have\n            # in the hard database unique constraint\n            # so that the user is able to find collections with the same hash across\n            # keys\n            # if this is not desired, set skip_hash_lookup=True\n            existing_collection = Collection.objects.filter(\n                ~Q(branch_id=-1),\n                hash=hash,\n            ).first()\n        else:\n            existing_collection = None\n        if existing_collection is not None:\n            logger.warning(\n                f\"returning collection with same hash: {existing_collection}; if you intended to query to track this collection as an input, use: ln.Collection.get()\"\n            )\n            init_self_from_db(self, existing_collection)\n            update_attributes(self, {\"description\": description, \"key\": key})\n            populate_subsequent_run(self, run)\n        else:\n            _skip_validation = revises is not None and key == revises.key\n            super().__init__(  # type: ignore\n                uid=provisional_uid,\n                key=key,\n                description=description,\n                reference=reference,\n                reference_type=reference_type,\n                meta_artifact=meta_artifact,\n                hash=hash,\n                run=run,\n        
        version_tag=version_tag,\n                branch=branch,\n                branch_id=branch_id,\n                space=space,\n                space_id=space_id,\n                revises=revises,\n                _skip_validation=_skip_validation,\n            )\n        self._artifacts = artifacts\n        if revises is not None and revises.uid != self.uid:\n            track_run_input(revises, run=run)\n        track_run_input(artifacts, run=run)\n\n    @strict_classmethod\n    def get(\n        cls,\n        idlike: int | str | None = None,\n        *,\n        is_run_input: bool | Run = False,\n        **expressions,\n    ) -> Artifact:\n        \"\"\"Get a single collection.\n\n        Args:\n            idlike: Either a uid stub, uid or an integer id.\n            is_run_input: Whether to track this collection as run input.\n            expressions: Fields and values passed as Django query expressions.\n\n        Raises:\n            :exc:`lamindb.errors.DoesNotExist`: In case no matching record is found.\n\n        See Also:\n            - Method in `SQLRecord` base class: :meth:`~lamindb.models.SQLRecord.get`\n\n        Examples:\n\n            ::\n\n                collection = ln.Collection.get(\"okxPW6GIKBfRBE3B0000\")\n                collection = ln.Collection.get(key=\"scrna/collection1\")\n        \"\"\"\n        from .query_set import QuerySet\n\n        return QuerySet(model=cls).get(idlike, is_run_input=is_run_input, **expressions)\n\n    def append(self, artifact: Artifact, run: Run | None = None) -> Collection:\n        \"\"\"Append an artifact to the collection.\n\n        This does not modify the original collection in-place, but returns a new version\n        of the original collection with the appended artifact.\n\n        Args:\n            artifact: An artifact to add to the collection.\n            run: The run that creates the new version of the collection.\n\n        Examples:\n\n            ::\n\n                collection_v1 = 
ln.Collection(artifact, key=\"My collection\").save()\n                collection_v2 = collection_v1.append(another_artifact)  # returns a new version of the collection\n                collection_v2.save()  # save the new version\n\n        \"\"\"\n        return Collection(  # type: ignore\n            self.artifacts.all().to_list() + [artifact],\n            # key is automatically derived from revises.key\n            description=self.description,\n            revises=self,\n            run=run,\n        )\n\n    def open(\n        self,\n        engine: Literal[\"pyarrow\", \"polars\"] = \"pyarrow\",\n        is_run_input: bool | None = None,\n        **kwargs,\n    ) -> PyArrowDataset | Iterator[PolarsLazyFrame]:\n        \"\"\"Open a dataset for streaming.\n\n        Works for `pyarrow` and `polars` compatible formats\n        (`.parquet`, `.csv`, `.ipc` etc. files or directories with such files).\n\n        Args:\n            engine: Which module to use for lazy loading of a dataframe\n                from `pyarrow` or `polars` compatible formats.\n            is_run_input: Whether to track this collection as run input.\n            **kwargs: Keyword arguments for `pyarrow.dataset.dataset` or `polars.scan_*` functions.\n\n        Notes:\n            For more info, see guide: :doc:`/arrays`.\n        \"\"\"\n        if self._state.adding:\n            artifacts = self._artifacts\n            logger.warning(\"the collection isn't saved, consider calling `.save()`\")\n        else:\n            artifacts = self.ordered_artifacts.all()\n        paths = [artifact.path for artifact in artifacts]\n\n        from ..core.storage._backed_access import _open_dataframe\n\n        dataframe = _open_dataframe(paths, engine=engine, **kwargs)\n        # track only if successful\n        track_run_input(self, is_run_input)\n        return dataframe\n\n    def mapped(\n        self,\n        layers_keys: str | list[str] | None = None,\n        obs_keys: str | list[str] | None = 
None,\n        obsm_keys: str | list[str] | None = None,\n        obs_filter: dict[str, str | list[str]] | None = None,\n        join: Literal[\"inner\", \"outer\"] | None = \"inner\",\n        encode_labels: bool | list[str] = True,\n        unknown_label: str | dict[str, str] | None = None,\n        cache_categories: bool = True,\n        parallel: bool = False,\n        dtype: str | None = None,\n        stream: bool = False,\n        is_run_input: bool | None = None,\n    ) -> MappedCollection:\n        \"\"\"Return a map-style dataset.\n\n        Returns a `pytorch map-style dataset\n        <https://pytorch.org/docs/stable/data.html#map-style-datasets>`__ by\n        virtually concatenating `AnnData` arrays.\n\n        By default (`stream=False`) `AnnData` arrays are moved into a local\n        cache first.\n\n        `__getitem__` of the `MappedCollection` object takes a single integer index\n        and returns a dictionary with the observation data sample for this index from\n        the `AnnData` objects in the collection. The dictionary has keys for `layers_keys`\n        (`.X` is in `\"X\"`), `obs_keys`, `obsm_keys` (under `f\"obsm_{key}\"`) and also `\"_store_idx\"`\n        for the index of the `AnnData` object containing this observation sample.\n\n        .. note::\n\n            For a guide, see :doc:`docs:scrna-mappedcollection`.\n\n            This method currently only works for collections or query sets of `AnnData` artifacts.\n\n        Args:\n            layers_keys: Keys from the ``.layers`` slot. 
``layers_keys=None`` or ``\"X\"`` in the list\n                retrieves ``.X``.\n            obs_keys: Keys from the ``.obs`` slots.\n            obsm_keys: Keys from the ``.obsm`` slots.\n            obs_filter: Select only observations with these values for the given obs columns.\n                Should be a dictionary with obs column names as keys\n                and filtering values (a string or a list of strings) as values.\n            join: `\"inner\"` or `\"outer\"` virtual joins. If ``None`` is passed,\n                does not join.\n            encode_labels: Encode labels into integers.\n                Can be a list with elements from ``obs_keys``.\n            unknown_label: Encode this label to -1.\n                Can be a dictionary with keys from ``obs_keys`` if ``encode_labels=True``\n                or from ``encode_labels`` if it is a list.\n            cache_categories: Enable caching categories of ``obs_keys`` for faster access.\n            parallel: Enable sampling with multiple processes.\n            dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm`` to this dtype.\n            stream: Whether to stream data from the array backend.\n            is_run_input: Whether to track this collection as run input.\n\n        Examples:\n            >>> import lamindb as ln\n            >>> from torch.utils.data import DataLoader\n            >>> collection = ln.Collection.get(description=\"my collection\")\n            >>> mapped = collection.mapped(obs_keys=[\"cell_type\", \"batch\"])\n            >>> dl = DataLoader(mapped, batch_size=128, shuffle=True)\n            >>> # also works for query sets of artifacts, '...' 
represents some filtering condition\n            >>> # additional filtering on artifacts of the collection\n            >>> mapped = collection.artifacts.all().filter(...).order_by(\"-created_at\").mapped()\n            >>> # or directly from a query set of artifacts\n            >>> mapped = ln.Artifact.filter(..., otype=\"AnnData\").order_by(\"-created_at\").mapped()\n        \"\"\"\n        from ..core._mapped_collection import MappedCollection\n\n        path_list = []\n        if self._state.adding:\n            artifacts = self._artifacts\n            logger.warning(\"the collection isn't saved, consider calling `.save()`\")\n        else:\n            artifacts = self.ordered_artifacts.all()\n        for artifact in artifacts:\n            if \".h5ad\" not in artifact.suffix and \".zarr\" not in artifact.suffix:\n                logger.warning(f\"ignoring artifact with suffix {artifact.suffix}\")\n                continue\n            elif not stream:\n                path_list.append(artifact.cache())\n            else:\n                path_list.append(artifact.path)\n        ds = MappedCollection(\n            path_list,\n            layers_keys,\n            obs_keys,\n            obsm_keys,\n            obs_filter,\n            join,\n            encode_labels,\n            unknown_label,\n            cache_categories,\n            parallel,\n            dtype,\n        )\n        # track only if successful\n        track_run_input(self, is_run_input)\n        return ds\n\n    def cache(self, is_run_input: bool | None = None) -> list[UPath]:\n        \"\"\"Download cloud artifacts in collection to local cache.\n\n        Follows syncing logic: only downloads outdated artifacts.\n\n        Returns ordered paths to locally cached on-disk artifacts via `.ordered_artifacts.all()`:\n\n        Args:\n            is_run_input: Whether to track this collection as run input.\n        \"\"\"\n        path_list = []\n        for artifact in 
self.ordered_artifacts.all():\n            # do not want to track data lineage on the artifact level\n            path_list.append(artifact.cache(is_run_input=False))\n        track_run_input(self, is_run_input)\n        return path_list\n\n    def load(\n        self,\n        join: Literal[\"inner\", \"outer\"] = \"outer\",\n        is_run_input: bool | None = None,\n        **kwargs,\n    ) -> pd.DataFrame | ad.AnnData:\n        \"\"\"Cache and load to memory.\n\n        Returns an in-memory concatenated `DataFrame` or `AnnData` object.\n        \"\"\"\n        # cannot call track_run_input here, see comment further down\n        artifacts = self.ordered_artifacts.all()\n        concat_object = _load_concat_artifacts(artifacts, join, **kwargs)\n        # only call it here because there might be errors during load or concat\n        track_run_input(self, is_run_input)\n        return concat_object\n\n    def save(self, using: str | None = None) -> Collection:\n        \"\"\"Save the collection and underlying artifacts to database & storage.\n\n        Args:\n            using: The database to which you want to save.\n\n        Examples:\n            >>> collection = ln.Collection(\"./myfile.csv\", name=\"myfile\")\n        \"\"\"\n        if self.meta_artifact is not None:\n            self.meta_artifact.save()\n        super().save()\n        # we don't allow updating the collection of artifacts\n        # if users want to update the set of artifacts, they\n        # have to create a new collection\n        if hasattr(self, \"_artifacts\"):\n            links = [\n                CollectionArtifact(collection_id=self.id, artifact_id=artifact.id)  # type: ignore\n                for artifact in self._artifacts\n            ]\n            # the below seems to preserve the order of the list in the\n            # auto-incrementing integer primary\n            # merely using .artifacts.set(*...) 
doesn't achieve this\n            # we need ignore_conflicts=True so that this won't error if links already exist\n            CollectionArtifact.objects.bulk_create(links, ignore_conflicts=True)\n        save_schema_links(self)\n        if using is not None:\n            logger.warning(\"using argument is ignored\")\n        return self\n\n    def restore(self) -> None:\n        \"\"\"Restore collection record from trash.\n\n        Examples:\n\n            For any `Collection` object `collection`, call:\n\n            >>> collection.restore()\n        \"\"\"\n        self.branch_id = 1\n        self.save()\n\n    @property\n    def transform(self) -> Transform | None:\n        \"\"\"Transform whose run created the collection.\"\"\"\n        return self.run.transform if self.run is not None else None\n\n    @property\n    def name(self) -> str:\n        \"\"\"Name of the collection.\n\n        Splits `key` on `/` and returns the last element.\n        \"\"\"\n        return self.key.split(\"/\")[-1]\n\n    @property\n    def ordered_artifacts(self) -> QuerySet:\n        \"\"\"Ordered `QuerySet` of `.artifacts`.\n\n        Accessing the many-to-many field `collection.artifacts` directly gives\n        you non-deterministic order.\n\n        Using the property `.ordered_artifacts` allows to iterate through a set\n        that's ordered by the order of the list that created the collection.\n        \"\"\"\n        return self.artifacts.order_by(\"links_collection__id\")\n\n    @property\n    def data_artifact(self) -> Artifact | None:\n        \"\"\"Access to a single data artifact.\n\n        If the collection has a single data & metadata artifact, this allows access via::\n\n           collection.data_artifact  # first & only element of collection.artifacts\n           collection.meta_artifact  # metadata\n\n        \"\"\"\n        return self.artifacts.first()\n\n\n# internal function, not exposed to user\ndef from_artifacts(artifacts: Iterable[Artifact]) -> 
tuple[str, dict[str, str]]:\n    # assert all artifacts are already saved\n    saved = not any(artifact._state.adding for artifact in artifacts)\n    if not saved:\n        raise ValueError(\"Not all artifacts are yet saved, please save them\")\n    # validate consistency of hashes - we do not allow duplicate hashes\n    hashes = [artifact.hash for artifact in artifacts if artifact.hash is not None]\n    hashes_set = set(hashes)\n    if len(hashes) != len(hashes_set):\n        seen = set()\n        non_unique = [x for x in hashes if x in seen or seen.add(x)]  # type: ignore\n        logger.warning(\n            f\"your collection contains artifacts with non-unique hashes:  {non_unique}\"\n        )\n    hash = hash_set(hashes_set)\n    return hash\n\n\nclass CollectionArtifact(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    collection: Collection = ForeignKey(\n        Collection, CASCADE, related_name=\"links_artifact\"\n    )\n    artifact: Artifact = ForeignKey(Artifact, PROTECT, related_name=\"links_collection\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"collection\", \"artifact\")\n\n\n# mypy: ignore-errors\nCollection.view_lineage = view_lineage\n"
  },
  {
    "path": "lamindb/models/feature.py",
    "content": "from __future__ import annotations\n\nimport importlib\nimport warnings\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Any, cast, get_args, overload\n\nimport numpy as np\nimport pgtrigger\nfrom django.conf import settings as django_settings\nfrom django.db import connection, models\nfrom django.db.models import CASCADE, PROTECT\nfrom django.db.models.query_utils import DeferredAttribute\nfrom django.db.utils import IntegrityError as DjangoIntegrityError\nfrom lamin_utils import logger\nfrom lamindb_setup._init_instance import get_schema_module_name\nfrom lamindb_setup.core import deprecated\nfrom lamindb_setup.core.hashing import HASH_LENGTH, hash_dict, hash_string\nfrom lamindb_setup.errors import (\n    MODULE_WASNT_CONFIGURED_MESSAGE_TEMPLATE,\n    ModuleWasntConfigured,\n)\n\nfrom lamindb.base.fields import (\n    BooleanField,\n    CharField,\n    ForeignKey,\n    JSONField,\n    TextField,\n)\nfrom lamindb.base.types import DtypeStr, FieldAttr\nfrom lamindb.errors import (\n    FieldValidationError,\n    IntegrityError,\n    InvalidArgument,\n    ValidationError,\n)\n\nfrom ..base.uids import base62_12\nfrom ._relations import dict_module_name_to_model_name\nfrom .can_curate import CanCurate\nfrom .has_parents import _query_relatives\nfrom .query_set import QuerySet, SQLRecordList\nfrom .run import (\n    TracksRun,\n    TracksUpdates,\n)\nfrom .sqlrecord import BaseSQLRecord, HasType, Registry, SQLRecord, _get_record_kwargs\n\nif TYPE_CHECKING:\n    from collections.abc import Iterable\n\n    import pandas as pd\n    from pandas.core.dtypes.base import ExtensionDtype\n\n    from .artifact import Artifact\n    from .block import FeatureBlock\n    from .project import Project\n    from .query_manager import RelatedManager\n    from .record import Record\n    from .run import Run\n    from .schema import Schema\n    from .ulabel import ULabel\n\nFEATURE_DTYPES = set(get_args(DtypeStr))\n\n\n@dataclass(frozen=True)\nclass 
FeaturePredicate:\n    \"\"\"Predicate generated by comparing a Feature to a value.\"\"\"\n\n    feature: Feature\n    comparator: str\n    value: Any\n\n    def __bool__(self) -> bool:\n        raise TypeError(\n            \"Feature predicates cannot be used as booleans. \"\n            \"Pass them into `.filter(...)`.\"\n        )\n\n\ndef parse_dtype(\n    dtype_str: str, check_exists: bool = False, old_format: bool = False\n) -> list[dict[str, Any]]:\n    \"\"\"Parses feature data type string into a structured list of components.\"\"\"\n    from .artifact import Artifact\n\n    allowed_dtypes = FEATURE_DTYPES\n\n    # Handle list[...] types\n    if dtype_str.startswith(\"list[\") and dtype_str.endswith(\"]\"):\n        inner_dtype_str = dtype_str[5:-1]  # Remove \"list[\" and \"]\"\n        # Recursively parse the inner type\n        inner_result = parse_dtype(inner_dtype_str, old_format=old_format)\n        # Add \"list\": True to each component\n        for component in inner_result:\n            if isinstance(component, dict):\n                component[\"list\"] = True  # type: ignore\n        return inner_result\n\n    is_composed_cat = dtype_str.startswith(\"cat[\") and dtype_str.endswith(\"]\")\n    result: list[dict[str, Any]] = []\n    # backward compatibility for bare \"cat\" dtype (deprecated)\n    if dtype_str == \"cat\":\n        return result\n    if is_composed_cat:\n        related_registries = dict_module_name_to_model_name(Artifact)\n        registries_str = dtype_str.replace(\"cat[\", \"\")[:-1]  # strip last ]\n        if registries_str != \"\":\n            registry_str_list = registries_str.split(\"|\")\n            for cat_single_dtype_str in registry_str_list:\n                single_result = parse_cat_dtype(\n                    cat_single_dtype_str,\n                    related_registries=related_registries,\n                    check_exists=check_exists,\n                    old_format=old_format,\n                )\n                
result.append(single_result)\n    elif dtype_str not in allowed_dtypes:\n        raise ValueError(\n            f\"dtype is '{dtype_str}' but has to be one of {FEATURE_DTYPES}!\"\n        )\n    return result\n\n\ndef get_record_type_from_uid(\n    registry: Registry,\n    record_uid: str,\n) -> SQLRecord:\n    type_record: SQLRecord = registry.get(record_uid)\n\n    if type_record.branch_id == -1:\n        warning_msg = f\"retrieving {registry.__name__} type '{type_record.name}' (uid='{record_uid}') from trash\"\n        logger.warning(warning_msg)\n\n    if not type_record.is_type:\n        raise InvalidArgument(\n            f\"The resolved {type_record.__class__.__name__} '{type_record.name}' (uid='{record_uid}') is not a type: is_type is False.\"\n        )\n    return type_record\n\n\ndef get_record_type_from_nested_subtypes(\n    registry: Registry, subtypes_list: list[str], field_str: str\n) -> SQLRecord:\n    \"\"\"Get a record type by querying nested subtypes using raw SQL.\n\n    This function only works with Record or ULabel registries.\n    \"\"\"\n    table_name = registry._meta.db_table\n    final_name = subtypes_list[-1]\n\n    # Build the SQL query with nested joins\n    # For subtypes_list = [\"A\", \"B\", \"C\"], we want:\n    # - Record with name=\"C\"\n    # - Its type has name=\"B\"\n    # - That type's type has name=\"A\"\n\n    params: list[str | bool]\n    if len(subtypes_list) > 1:\n        # Build nested joins for parent types\n        parent_types = list(reversed(subtypes_list[:-1]))\n        joins = []\n        where_clauses = [\"t0.name = %s\"]  # Final record name\n        params = [final_name]\n\n        for i, parent_type_name in enumerate(parent_types):\n            alias = f\"t{i + 1}\"\n            prev_alias = f\"t{i}\"\n            joins.append(\n                f\"INNER JOIN {table_name} {alias} ON {prev_alias}.type_id = {alias}.id\"\n            )\n            where_clauses.append(f\"{alias}.name = %s\")\n            
where_clauses.append(f\"{alias}.is_type = %s\")\n            params.extend([parent_type_name, True])\n\n        join_clause = \" \".join(joins)\n        where_clause = \" AND \".join(where_clauses)\n\n        query = f\"\"\"\n            SELECT t0.*\n            FROM {table_name} t0\n            {join_clause}\n            WHERE {where_clause}\n            LIMIT 1\n        \"\"\"\n    else:\n        # Single type, no parent - type must be NULL\n        query = f\"\"\"\n            SELECT *\n            FROM {table_name}\n            WHERE name = %s AND type_id IS NULL\n            LIMIT 1\n        \"\"\"\n        params = [final_name]\n\n    try:\n        with connection.cursor() as cursor:\n            cursor.execute(query, params)\n            columns = [col[0] for col in cursor.description]\n            rows = cursor.fetchall()\n\n            if not rows:\n                raise IntegrityError(\n                    f\"No {registry.__name__} type found matching subtypes {subtypes_list} for field `.{field_str}`\"\n                )\n\n            if len(rows) > 1:\n                raise IntegrityError(\n                    f\"Multiple {registry.__name__} types found matching subtypes {subtypes_list} for field `.{field_str}`\"\n                )\n\n            # Create a dictionary from the row data\n            row_dict = dict(zip(columns, rows[0]))\n\n            # Create a minimal mock object with only the fields we need\n            # This avoids querying the database which may not have all columns during migrations\n            # We create a simple object and set its class to the registry for proper error messages\n            type_record: SQLRecord = object.__new__(registry)\n            type_record.id = row_dict.get(\"id\")\n            type_record.uid = row_dict.get(\"uid\")\n            type_record.name = row_dict.get(\"name\")\n            type_record.is_type = row_dict.get(\"is_type\", False)\n            # Initialize _state attribute needed by Django 
models\n            # Create a minimal state object with the required attributes\n            state = type(\"ModelState\", (), {\"adding\": False, \"db\": \"default\"})()\n            type_record._state = state\n\n    except IntegrityError:\n        raise\n    except Exception as e:\n        raise IntegrityError(\n            f\"Error retrieving {registry.__name__} type with subtypes {subtypes_list} for field `.{field_str}`: {e}\"\n        ) from e\n\n    if not type_record.is_type:\n        raise InvalidArgument(\n            f\"The resolved {type_record.__class__.__name__} '{type_record.name}' for field `.{field_str}` is not a type: is_type is False.\"\n        )\n    return type_record\n\n\ndef dtype_as_object(dtype_str: str, old_format: bool = False) -> type | None:\n    def _dtype_as_object_simple(dtype_str: str) -> type | None:\n        if dtype_str == \"str\":\n            return str\n        elif dtype_str == \"url\":\n            return str\n        elif dtype_str == \"int\":\n            return int\n        elif dtype_str in (\"float\", \"num\"):\n            return float\n        elif dtype_str == \"bool\":\n            return bool\n        elif dtype_str == \"date\":\n            from datetime import date\n\n            return date\n        elif dtype_str == \"datetime\":\n            from datetime import datetime\n\n            return datetime\n        elif dtype_str.startswith(\"dict\"):\n            return dict\n        return None\n\n    if dtype_str is None:\n        return None\n\n    parsed_dtypes = parse_dtype(dtype_str, check_exists=True, old_format=old_format)\n    if len(parsed_dtypes) > 0:\n        dtype_objects = []\n        for parsed_dtype in parsed_dtypes:\n            if parsed_dtype.get(\"record_uid\"):\n                # return the subtype record for dtypes with record_uid\n                dtype_object = get_record_type_from_uid(\n                    parsed_dtype[\"registry\"],\n                    parsed_dtype[\"record_uid\"],\n      
          )\n            elif parsed_dtype.get(\"subtypes_list\"):\n                dtype_object = get_record_type_from_nested_subtypes(\n                    parsed_dtype[\"registry\"],\n                    parsed_dtype[\"subtypes_list\"],\n                    parsed_dtype[\"field\"],\n                )\n            else:\n                # return field for dtypes without record_uid, e.g. bt.CellType.ontology_id\n                dtype_object = parsed_dtype[\"field\"]\n            # for list, returns list[SQLRecord]\n            dtype_objects.append(\n                list[dtype_object]  # type: ignore\n                if \"list\" in parsed_dtype and parsed_dtype[\"list\"]\n                else dtype_object\n            )\n        return dtype_objects if len(dtype_objects) > 1 else dtype_objects[0]  # type: ignore\n    elif dtype_str.startswith(\"list[\"):\n        # for simple lists, returns list[python_type]\n        dtype_simple_object = _dtype_as_object_simple(\n            dtype_str.removeprefix(\"list[\").removesuffix(\"]\")\n        )\n        return (\n            list[dtype_simple_object] if dtype_simple_object is not None else list  # type: ignore\n        )\n    else:\n        return _dtype_as_object_simple(dtype_str)\n\n\ndef parse_cat_dtype(\n    dtype_str: str,\n    related_registries: dict[str, SQLRecord] | None = None,\n    is_itype: bool = False,\n    check_exists: bool = False,\n    old_format: bool = False,\n) -> dict[str, Any]:\n    \"\"\"Parses a categorical dtype string into its components (registry, field, subtypes).\"\"\"\n    from .artifact import Artifact\n\n    assert isinstance(dtype_str, str)  # noqa: S101\n    if related_registries is None:\n        related_registries = dict_module_name_to_model_name(Artifact)\n\n    # Parse the string considering nested brackets\n    parsed = parse_nested_brackets(dtype_str, old_format=old_format)\n    registry_str = parsed[\"registry\"]\n    filter_str = parsed[\"filter_str\"]\n    field_str = 
parsed[\"field\"]\n\n    if not is_itype:\n        if registry_str not in related_registries:\n            raise ValidationError(\n                f\"'{registry_str}' is an invalid dtype, has to be registry, e.g. ULabel or bionty.CellType\"\n            )\n        registry = related_registries[registry_str]\n    else:\n        if \".\" in registry_str:\n            registry_str_split = registry_str.split(\".\")\n            assert len(registry_str_split) == 2, registry_str  # noqa: S101\n            module_name_attempt, class_name = registry_str_split\n            module_name = get_schema_module_name(\n                module_name_attempt, raise_import_error=False\n            )\n            if module_name is None:\n                raise ModuleWasntConfigured(\n                    MODULE_WASNT_CONFIGURED_MESSAGE_TEMPLATE.format(\n                        module_name_attempt, module_name_attempt\n                    )\n                )\n        else:\n            module_name, class_name = \"lamindb\", registry_str\n        module = importlib.import_module(module_name)\n        registry = getattr(module, class_name)\n\n    if field_str == \"\":\n        field_str = registry._name_field if hasattr(registry, \"_name_field\") else \"name\"\n    assert hasattr(registry, field_str), f\"{registry} has no field {field_str}\"\n\n    record_uid = parsed.get(\"record_uid\")\n    subtypes_list = parsed.get(\"subtypes_list\")\n\n    # Handle old format (subtypes_list) or new format (record_uid)\n    if subtypes_list and check_exists:\n        # Old format: validate that the Record exists using nested subtypes\n        # subtypes_list is guaranteed to be list[str] when present\n        if isinstance(subtypes_list, list):\n            get_record_type_from_nested_subtypes(\n                registry, cast(list[str], subtypes_list), field_str\n            )\n    elif record_uid and check_exists:\n        get_record_type_from_uid(registry, record_uid)\n\n    if filter_str != \"\":\n    
    # TODO: validate or process filter string\n        pass\n    result = {\n        \"registry\": registry,  # should be typed as CanCurate\n        \"registry_str\": registry_str,\n        \"filter_str\": filter_str,\n        \"field_str\": field_str,\n        \"field\": getattr(registry, field_str),\n    }\n\n    # Add record_uid if it exists (new format)\n    if record_uid:\n        result[\"record_uid\"] = record_uid\n\n    # Add subtypes_list if it exists (old format)\n    if subtypes_list:\n        result[\"subtypes_list\"] = subtypes_list\n\n    return result\n\n\ndef parse_nested_brackets(dtype_str: str, old_format: bool = False) -> dict[str, Any]:\n    \"\"\"Parse dtype string with potentially nested brackets.\n\n    Examples:\n        \"A\" -> {\"registry\": \"A\", \"filter_str\": \"\", \"field\": \"\"}\n        \"A.field\" -> {\"registry\": \"A\", \"filter_str\": \"\", \"field\": \"field\"}\n        \"Record[abcdefg123456]\" -> {\"registry\": \"Record\", \"filter_str\": \"\", \"field\": \"\", \"record_uid\": \"abcdefg123456\"}\n        \"Record[abcdefg123456].name\" -> {\"registry\": \"Record\", \"filter_str\": \"\", \"field\": \"name\", \"record_uid\": \"abcdefg123456\"}\n        \"bionty.Gene.ensembl_gene_id[source__id='abcd']\" -> {\"registry\": \"bionty.Gene\", \"filter_str\": \"source__id='abcd'\", \"field\": \"ensembl_gene_id\"}\n\n    Args:\n        dtype_str: The dtype string to parse\n\n    Returns:\n        Dictionary with parsed components\n    \"\"\"\n    if \"[\" not in dtype_str:\n        # No brackets - handle simple cases like \"A\" or \"A.field\"\n        if \".\" in dtype_str:\n            parts = dtype_str.split(\".\")\n            if len(parts) == 2 and parts[1][0].isupper():\n                # bionty.CellType\n                return {\"registry\": dtype_str, \"filter_str\": \"\", \"field\": \"\"}\n            elif len(parts) == 3:\n                # bionty.CellType.name\n                return {\n                    \"registry\": 
f\"{parts[0]}.{parts[1]}\",\n                    \"filter_str\": \"\",\n                    \"field\": parts[2],\n                }\n            else:\n                # ULabel.name\n                return {\"registry\": parts[0], \"filter_str\": \"\", \"field\": parts[1]}\n        else:\n            # Simple registry name\n            return {\"registry\": dtype_str, \"filter_str\": \"\", \"field\": \"\"}\n\n    # Find the first opening bracket\n    first_bracket = dtype_str.index(\"[\")\n    # Handle case where registry_part contains a field (e.g., \"bionty.Gene.ensembl_gene_id[filters]\")\n    registry_and_field = dtype_str[:first_bracket]\n    if \".\" in registry_and_field:\n        parts = registry_and_field.split(\".\")\n        if len(parts) == 3:\n            registry_part = f\"{parts[0]}.{parts[1]}\"\n            pre_bracket_field = parts[2]\n        else:\n            registry_part = registry_and_field\n            pre_bracket_field = \"\"\n    else:\n        registry_part = registry_and_field\n        pre_bracket_field = \"\"\n\n    # Find the matching closing bracket for the first opening bracket\n    bracket_count = 0\n    closing_bracket_pos = -1\n\n    for i in range(first_bracket, len(dtype_str)):\n        if dtype_str[i] == \"[\":\n            bracket_count += 1\n        elif dtype_str[i] == \"]\":\n            bracket_count -= 1\n            if bracket_count == 0:\n                closing_bracket_pos = i\n                break\n\n    if closing_bracket_pos == -1:\n        raise ValueError(f\"Unmatched brackets in dtype string: {dtype_str}\")\n\n    # Extract content between brackets\n    bracket_content = dtype_str[first_bracket + 1 : closing_bracket_pos]\n\n    # Check for field after the closing bracket\n    field_part = \"\"\n    remainder = dtype_str[closing_bracket_pos + 1 :]\n    if remainder.startswith(\".\"):\n        field_part = remainder[1:]  # Remove the dot\n\n    # Use pre_bracket_field if no post_bracket field\n    if not 
field_part and pre_bracket_field:\n        field_part = pre_bracket_field\n\n    # Extract UID, subtypes_list, or filter from bracket content\n    # For UID-based format: Record[uid] or ULabel[uid] -> record_uid\n    # For old name-based format: Record[Name] or Record[Parent[Child]] -> subtypes_list\n    # For filter format: registry.field[filter] -> filter_str\n    record_uid = None\n    subtypes_list = None\n    filter_str = \"\"\n\n    # If registry is Record or ULabel, bracket content could be UID or name(s)\n    if registry_part in (\"Record\", \"ULabel\"):\n        if bracket_content:\n            if old_format:\n                # Old format with nested brackets like Record[Parent[Child]]\n                extracted = extract_subtypes_and_filter(bracket_content)\n                subtypes_list = extracted[\"subtypes_list\"]\n                filter_str = extracted[\"filter_str\"]\n            else:\n                record_uid = bracket_content\n    else:\n        # For other registries, bracket content is a filter\n        filter_str = bracket_content if bracket_content else \"\"\n\n    result = {\n        \"registry\": registry_part,\n        \"filter_str\": filter_str,\n        \"field\": field_part,\n    }\n\n    # Add record_uid if it exists (new format)\n    if record_uid:\n        result[\"record_uid\"] = record_uid\n\n    # Add subtypes_list if it exists (old format)\n    if subtypes_list:\n        result[\"subtypes_list\"] = subtypes_list\n\n    return result\n\n\ndef extract_subtypes_and_filter(subtype_str: str) -> dict[str, Any]:\n    \"\"\"Extract nested subtypes and optional filter from a nested subtype string.\n\n    Examples:\n        \"B\" -> {\"subtypes_list\": [\"B\"], \"filter_str\": \"\"}\n        \"B[C]\" -> {\"subtypes_list\": [\"B\", \"C\"], \"filter_str\": \"\"}\n        \"B[C[filter='<value>']]\" -> {\"subtypes_list\": [\"B\", \"C\"], \"filter_str\": \"filter='<value>'\"}\n        \"B[C[D]]\" -> {\"subtypes_list\": [\"B\", \"C\", \"D\"], 
\"filter_str\": \"\"}\n        \"B[C[D[E]]]\" -> {\"subtypes_list\": [\"B\", \"C\", \"D\", \"E\"], \"filter_str\": \"\"}\n        \"B[filter='value']\" -> {\"subtypes_list\": [\"B\"], \"filter_str\": \"filter='value'\"}\n        \"Customer[UScustomer[region='US']]\" -> {\"subtypes_list\": [\"Customer\", \"UScustomer\"], \"filter_str\": \"region='US'\"}\n\n    Args:\n        subtype_str: The subtype string with potential nesting\n\n    Returns:\n        Dictionary with subtypes_list and filter_str\n    \"\"\"\n    subtypes: list[str] = []\n    filter_str = \"\"\n    current = subtype_str\n\n    while current:\n        if \"[\" not in current:\n            # No more brackets\n            if current and \"=\" not in current:\n                # It's a subtype name\n                subtypes.append(current)\n            elif current and \"=\" in current:\n                # It's a filter\n                filter_str = current\n            break\n\n        # Find the first part before the bracket\n        bracket_pos = current.index(\"[\")\n        part = current[:bracket_pos]\n\n        # Add the part (it's a subtype name)\n        if part:\n            subtypes.append(part)\n\n        # Find the matching closing bracket\n        bracket_count = 0\n        closing_pos = -1\n\n        for i in range(bracket_pos, len(current)):\n            if current[i] == \"[\":\n                bracket_count += 1\n            elif current[i] == \"]\":\n                bracket_count -= 1\n                if bracket_count == 0:\n                    closing_pos = i\n                    break\n\n        if closing_pos == -1:\n            break\n\n        # Move to the content inside the brackets\n        current = current[bracket_pos + 1 : closing_pos]\n\n    return {\"subtypes_list\": subtypes, \"filter_str\": filter_str}\n\n\ndef serialize_dtype(\n    dtype: Registry\n    | SQLRecord\n    | FieldAttr\n    | list[SQLRecord]\n    | list[Registry]\n    | list[str]\n    | list[float]\n    | 
str\n    | type,\n    is_itype: bool = False,\n) -> str:\n    \"\"\"Converts a data type object into its string representation.\"\"\"\n    from .record import Record\n    from .ulabel import ULabel\n\n    # Handle generic types like list[str], list[Registry], etc.\n    if hasattr(dtype, \"__origin__\") and dtype.__origin__ is list:\n        # Get the inner type from list[T]\n        inner_type = dtype.__args__[0] if dtype.__args__ else None  # type: ignore\n        if inner_type is not None:\n            # Recursively serialize the inner type\n            inner_dtype_str = serialize_dtype(inner_type, is_itype=is_itype)\n            return f\"list[{inner_dtype_str}]\"\n\n    if (\n        not isinstance(dtype, list)\n        and hasattr(dtype, \"__name__\")\n        and dtype.__name__ in FEATURE_DTYPES\n    ):\n        dtype_str = dtype.__name__\n    elif dtype is dict:\n        dtype_str = \"dict\"\n    elif is_itype and isinstance(dtype, str):\n        if dtype not in \"Feature\":\n            parse_cat_dtype(\n                dtype_str=dtype, is_itype=True\n            )  # throws an error if invalid\n        dtype_str = dtype\n    else:\n        from pandas.core.dtypes.base import ExtensionDtype\n\n        if isinstance(dtype, (ExtensionDtype, np.dtype)):\n            dtype_str = serialize_pandas_dtype(dtype)\n        else:\n            error_message = \"dtype has to be a registry, a ulabel subtype, a registry field, or a list of registries or fields, not {}\"\n            if isinstance(dtype, (Registry, DeferredAttribute, ULabel, Record)):\n                dtype = [dtype]\n            elif not isinstance(dtype, list):\n                raise ValueError(error_message.format(dtype))\n            dtype_str = \"\"\n            for one_dtype in dtype:\n                if not isinstance(\n                    one_dtype, (Registry, DeferredAttribute, ULabel, Record)\n                ):\n                    raise ValueError(error_message.format(one_dtype))\n              
  if isinstance(one_dtype, Registry):\n                    dtype_str += one_dtype.__get_name_with_module__() + \"|\"\n                elif isinstance(one_dtype, (ULabel, Record)):\n                    if one_dtype._state.adding:\n                        raise InvalidArgument(\n                            f\"Cannot serialize unsaved objects. Save {one_dtype} via `.save()`.\"\n                        )\n                    if not one_dtype.is_type:\n                        raise InvalidArgument(\n                            f\"Cannot serialize non-type {one_dtype.__class__.__name__} '{one_dtype.name}'. Only types (is_type=True) are allowed in dtypes.\"\n                        )\n                    # Use UID-based format: Record[uid] instead of Record[Parent[Child]]\n                    nested_string = f\"[{one_dtype.uid}]\"\n                    if isinstance(one_dtype, ULabel):\n                        dtype_str += f\"ULabel{nested_string}\"\n                    else:\n                        dtype_str += f\"Record{nested_string}\"\n                else:\n                    name = one_dtype.field.name\n                    field_ext = f\".{name}\" if name != \"name\" else \"\"\n                    dtype_str += (\n                        one_dtype.field.model.__get_name_with_module__()\n                        + field_ext\n                        + \"|\"\n                    )\n            dtype_str = dtype_str.rstrip(\"|\")\n            if not is_itype:\n                dtype_str = f\"cat[{dtype_str}]\"\n    return dtype_str\n\n\ndef serialize_pandas_dtype(pandas_dtype: ExtensionDtype) -> str:\n    \"\"\"Convert pandas ExtensionDtype to simplified string representation.\"\"\"\n    from pandas.api.types import CategoricalDtype, is_string_dtype\n\n    if is_string_dtype(pandas_dtype):\n        if not isinstance(pandas_dtype, CategoricalDtype):\n            dtype = \"str\"\n        else:\n            dtype = \"cat[ULabel]\"\n    # there are string-like categoricals 
and \"pure\" categoricals (pd.Categorical)\n    elif isinstance(pandas_dtype, CategoricalDtype):\n        dtype = \"cat[ULabel]\"\n    else:\n        # strip precision qualifiers\n        dtype = \"\".join(dt for dt in pandas_dtype.name if not dt.isdigit())\n        if dtype == \"uint\":\n            dtype = \"int\"\n    if dtype.startswith(\"datetime\"):\n        dtype = dtype.split(\"[\")[0]\n    if dtype != \"cat[ULabel]\":\n        assert dtype in FEATURE_DTYPES  # noqa: S101\n    return dtype\n\n\ndef convert_to_pandas_dtype(lamin_dtype: str) -> str | pd.CategoricalDtype:\n    \"\"\"Convert LaminDB simplified string representation back to pandas dtype.\"\"\"\n    from pandas.api.types import CategoricalDtype\n\n    dtype_map = {\n        \"str\": \"string\",  # nullable string dtype\n        \"url\": \"string\",  # URLs are validated as strings\n        \"int\": \"Int64\",  # Nullable integer to handle missing values\n        \"num\": \"float64\",\n        \"float\": \"float64\",\n        \"bool\": \"boolean\",  # Nullable boolean\n        \"datetime\": \"datetime64[ns]\",\n        \"date\": \"object\",  # preserve Date objects\n        \"dict\": \"object\",  # dicts are stored as object dtype in pandas\n    }\n    if lamin_dtype in dtype_map:\n        return dtype_map[lamin_dtype]\n    elif lamin_dtype.startswith(\"cat\"):\n        return CategoricalDtype()\n    elif lamin_dtype.startswith(\"list\"):\n        return \"object\"  # lists are stored as object dtype in pandas\n    return lamin_dtype\n\n\ndef parse_filter_string(filter_str: str) -> dict[str, tuple[str, str | None, str]]:\n    \"\"\"Parse comma-separated Django filter expressions into structured components.\n\n    Args:\n        filter_str: Comma-separated filters like 'name=value, relation__field=value'\n\n    Returns:\n        Dict mapping original filter key to (relation_name, field_name, value) tuple.\n        For direct fields: field_name is None.\n        For relations: field_name contains 
the lookup field.\n    \"\"\"\n    filters = {}\n\n    filter_parts = [part.strip() for part in filter_str.split(\",\")]\n    for part in filter_parts:\n        if \"=\" not in part:\n            raise ValueError(f\"Invalid filter expression: '{part}' (missing '=' sign)\")\n\n        key, value = part.split(\"=\", 1)\n        key = key.strip()\n        value = value.strip().strip(\"'\\\"\")\n\n        if not key:\n            raise ValueError(f\"Invalid filter expression: '{part}' (empty key)\")\n        if not value:\n            raise ValueError(f\"Invalid filter expression: '{part}' (empty value)\")\n\n        if \"__\" in key:\n            relation_name, field_name = key.split(\"__\", 1)\n            filters[key] = (relation_name, field_name, value)\n        else:\n            filters[key] = (key, None, value)\n\n    return filters\n\n\ndef resolve_relation_filters(\n    parsed_filters: dict[str, tuple[str, str | None, str]], registry: SQLRecord\n) -> dict[str, str | SQLRecord]:\n    \"\"\"Resolve relation filters to actual model objects.\n\n    Args:\n        parsed_filters: Django filters like output from :func:`lamindb.models.feature.parse_filter_string`\n        registry: Model class to resolve relationships against\n\n    Returns:\n        Dict with resolved objects for successful relations, original values for direct fields and failed resolutions.\n    \"\"\"\n    resolved = {}\n    for filter_key, (relation_name, field_name, value) in parsed_filters.items():\n        if field_name is not None:  # relation filter\n            if hasattr(registry, relation_name):\n                relation_field = getattr(registry, relation_name)\n                if (\n                    hasattr(relation_field, \"field\")\n                    and relation_field.field.is_relation\n                ):\n                    related_model = relation_field.field.related_model\n                    related_obj = related_model.get(**{field_name: value})\n                    
resolved[relation_name] = related_obj\n        else:\n            resolved[filter_key] = value\n    return resolved\n\n\ndef migrate_dtype_to_uid_format(connection, input_field: str = \"_dtype_str\") -> None:\n    \"\"\"Update _dtype_str for nested Record/ULabel types to uid format.\n\n    Converts old format (name-based) dtype strings to new UID-based format.\n    This function is used in migrations to update existing feature records.\n\n    Args:\n        connection: Database connection (from schema_editor.connection)\n        input_field: Field name to read from (\"_dtype_str\" or \"dtype\")\n\n    Returns:\n        None. Updates are performed directly in the database.\n    \"\"\"\n    # Patterns to look for old format (name-based)\n    patterns = [\n        \"cat[Record[\",\n        \"cat[ULabel[\",\n        \"list[cat[Record[\",\n        \"list[cat[ULabel[\",\n    ]\n\n    # Build SQL query to fetch features matching any pattern\n    # Using OR conditions for each pattern\n    pattern_conditions = \" OR \".join(\n        [f\"{input_field} LIKE '{pattern}%'\" for pattern in patterns]\n    )\n\n    query = f\"\"\"\n        SELECT id, uid, name, {input_field}\n        FROM lamindb_feature\n        WHERE {pattern_conditions}\n    \"\"\"\n\n    # Fetch matching features\n    with connection.cursor() as cursor:\n        cursor.execute(query)\n        columns = [col[0] for col in cursor.description]\n        features = [dict(zip(columns, row)) for row in cursor.fetchall()]\n\n    # Convert each feature\n    for feature in features:\n        try:\n            # Convert old format string to objects, then serialize to UID format\n            dtype_objects = dtype_as_object(feature[input_field], old_format=True)\n            new_dtype_str = serialize_dtype(dtype_objects)\n\n            if new_dtype_str != feature[input_field]:\n                # Update using raw SQL\n                update_query = \"\"\"\n                    UPDATE lamindb_feature\n                    
SET _dtype_str = %s\n                    WHERE id = %s\n                \"\"\"\n                with connection.cursor() as cursor:\n                    cursor.execute(update_query, [new_dtype_str, feature[\"id\"]])\n\n        except Exception as e:\n            # If conversion fails, keep the original value\n            print(\n                f\"Warning: Could not convert dtype for feature {feature['name']} ({feature['uid']}) because of error: {e}\"\n            )\n            continue\n\n\ndef process_init_feature_param(args, kwargs):\n    # now we proceed with the user-facing constructor\n    if len(args) != 0:\n        raise ValueError(\"Only keyword args allowed\")\n    name: str = kwargs.pop(\"name\", None)\n    dtype: type | str | None = kwargs.pop(\"dtype\", None)\n    is_type: bool = kwargs.pop(\"is_type\", False)\n    type_: Feature | str | None = kwargs.pop(\"type\", None)\n    description: str | None = kwargs.pop(\"description\", None)\n    branch = kwargs.pop(\"branch\", None)\n    branch_id = kwargs.pop(\"branch_id\", 1)\n    space = kwargs.pop(\"space\", None)\n    space_id = kwargs.pop(\"space_id\", 1)\n    _skip_validation = kwargs.pop(\"_skip_validation\", False)\n    if kwargs:\n        valid_keywords = \", \".join([val[0] for val in _get_record_kwargs(Feature)])\n        raise FieldValidationError(f\"Only {valid_keywords} are valid keyword arguments\")\n    kwargs[\"name\"] = name\n    kwargs[\"type\"] = type_\n    kwargs[\"is_type\"] = is_type\n    kwargs[\"branch\"] = branch\n    kwargs[\"branch_id\"] = branch_id\n    kwargs[\"space\"] = space\n    kwargs[\"space_id\"] = space_id\n    kwargs[\"_skip_validation\"] = _skip_validation\n    kwargs[\"description\"] = description\n    # cast dtype\n    if dtype is None and not is_type:\n        raise ValidationError(\n            f\"Please pass dtype, one of {FEATURE_DTYPES} or a composed categorical dtype\"\n        )\n    dtype_str = None\n    if dtype is not None:\n        if not 
isinstance(dtype, str):\n            dtype_str = serialize_dtype(dtype)\n        elif dtype in {\"num\", \"path\", \"url\"}:\n            dtype_str = dtype\n        else:\n            logger.warning(\n                f\"rather than passing a string '{dtype}' to dtype, consider passing a Python object\"\n            )\n            dtype_str = dtype\n            parse_dtype(dtype_str, check_exists=True, old_format=True)\n            if dtype_str.startswith(\n                (\"cat[Record[\", \"cat[ULabel[\", \"list[cat[Record[\", \"list[cat[ULabel[\")\n            ):\n                # need to convert from old semantic format to new uid-based format\n                dtype_str = serialize_dtype(dtype_as_object(dtype_str, old_format=True))\n        kwargs[\"_dtype_str\"] = dtype_str\n    return kwargs\n\n\nUPDATE_FEATURE_ON_NAME_CHANGE = \"\"\"\\\nDECLARE\n    old_renamed JSONB;\n    new_renamed JSONB;\n    ts TEXT;\nBEGIN\n    -- Only proceed if name actually changed\n    IF OLD.name IS DISTINCT FROM NEW.name THEN\n        -- Update synonyms\n        IF NEW.synonyms IS NULL OR NEW.synonyms = '' THEN\n            NEW.synonyms := OLD.name;\n        ELSIF position(OLD.name in NEW.synonyms) = 0 THEN\n            NEW.synonyms := NEW.synonyms || '|' || OLD.name;\n        END IF;\n\n        -- Update _aux with rename history\n        ts := TO_CHAR(NOW() AT TIME ZONE 'UTC', 'YYYY-MM-DD\"T\"HH24:MI:SS\"Z\"');\n\n        -- Get existing renamed history or initialize empty object\n        old_renamed := COALESCE((OLD._aux->>'renamed')::JSONB, '{}'::JSONB);\n\n        -- Add old name with timestamp\n        new_renamed := old_renamed || jsonb_build_object(ts, OLD.name);\n\n        -- Update _aux with new renamed history\n        IF NEW._aux IS NULL THEN\n            NEW._aux := jsonb_build_object('renamed', new_renamed);\n        ELSE\n            NEW._aux := NEW._aux || jsonb_build_object('renamed', new_renamed);\n        END IF;\n    END IF;\n\n    RETURN 
NEW;\nEND;\n\"\"\"\n\n\nclass Feature(SQLRecord, HasType, CanCurate, TracksRun, TracksUpdates):\n    \"\"\"Measurable properties such as dataframe columns or record fields.\n\n    Features represent *what* is measured in a dataset—the variables or dimensions along which data is organized.\n    They enable you to query datasets based on their structure and corresponding label annotations.\n\n    Args:\n        name: `str` Name of the feature, typically a column name.\n        dtype: `type | ULabel | Record | DtypeStr | Registry | list[Registry] | FieldAttr`\n            Types or `ULabel` or `Record` objects representing types.\n            See :class:`~lamindb.base.types.DtypeStr`.\n        type: `Feature | None = None` A feature type, see :attr:`~lamindb.Feature.type`.\n        is_type: `bool = False` Whether this feature is a type, see :attr:`~lamindb.Feature.is_type`.\n        unit: `str | None = None` Unit of measure, ideally SI (`\"m\"`, `\"s\"`, `\"kg\"`, etc.) or `\"normalized\"` etc.\n        description: `str | None = None` A description.\n        synonyms: `str | None = None` Bar-separated synonyms.\n        nullable: `bool = True` Whether the feature can have null-like values (`None`, `pd.NA`, `NaN`, etc.), see :attr:`~lamindb.Feature.nullable`.\n        default_value: `Any | None = None` Default value for the feature.\n        coerce: `bool | None = None` When `True`, attempts to coerce values to the specified dtype during validation, see :attr:`~lamindb.Feature.coerce`.\n            Defaults to `False` unless `is_type` is `True`.\n        cat_filters: `dict[str, str | SQLRecord] | None = None` Subset a registry by additional filters to define valid categories.\n\n    Note:\n\n        For more control, you can use :mod:`bionty` registries to manage simple\n        biological entities like genes, proteins & cell markers. 
Or you define\n        custom registries to manage high-level derived features like gene sets.\n\n    See Also:\n        :meth:`~lamindb.Feature.from_dataframe`\n            Create feature records from DataFrame.\n        :attr:`~lamindb.Artifact.features`\n            Feature manager of an artifact or collection.\n        :class:`~lamindb.ULabel`\n            Universal labels.\n        :class:`~lamindb.Schema`\n            Sets of features.\n\n    Example:\n\n        Features with simple data types::\n\n            ln.Feature(name=\"sample_note\", dtype=str).save()\n            ln.Feature(name=\"temperature_in_celsius\", dtype=float).save()\n            ln.Feature(name=\"read_count\", dtype=int).save()\n\n        A categorical feature measuring labels managed in the `ULabel` registry::\n\n            ln.Feature(name=\"sample\", dtype=ln.ULabel).save()\n\n        Restrict a categorical feature to a specific `ULabel` type::\n\n            perturbation = ln.ULabel(name=\"Perturbation\", is_type=True).save()\n            ln.Feature(name=\"perturbation\", dtype=perturbation).save()\n\n        Restrict a categorical feature to a specific `Record` type::\n\n            experiment = ln.Record(name=\"Experiment\", is_type=True).save()\n            ln.Feature(name=\"experiment\", dtype=experiment).save()\n\n        Restrict a categorical feature to the `bt.CellType` registry::\n\n            ln.Feature(name=\"cell_type_by_expert\", dtype=bt.CellType).save()  # expert annotation\n            ln.Feature(name=\"cell_type_by_model\", dtype=bt.CellType).save()   # model annotation\n\n        .. 
admonition:: Categoricals define relationships.\n\n            In LaminDB, **categoricals** define **relationships**.\n            For example, with dtype set to a `ULabel` type, setting a feature value relates the object to a `ULabel` of that type.\n\n        Scope a feature with a **feature type** to distinguish the same feature name across different contexts::\n\n            abc_feature_type = ln.Feature(name=\"ABC\", is_type=True).save()  # ABC could reference a schema, a project, a team, etc.\n            ln.Feature(name=\"concentration_nM\", dtype=float, type=abc_feature_type).save()\n\n            xyz_feature_type = ln.Feature(name=\"XYZ\", is_type=True).save()  # XYZ could reference a schema, a project, a team, etc.\n            ln.Feature(name=\"concentration_nM\", dtype=float, type=xyz_feature_type).save()\n\n            # calling .save() again with the same name and type returns the existing feature\n            ln.Feature(name=\"concentration_nM\", dtype=float, type=xyz_feature_type).save()\n\n        Annotate an artifact with features (works identically for records and runs)::\n\n            artifact.features.set_values({\n                \"temperature_in_celsius\": 37.5,\n                \"sample_note\": \"Control sample\",\n            })\n\n        Query artifacts/records/runs by features::\n\n            ln.Artifact.filter(features__name=\"temperature_in_celsius\")  # artifacts with this feature\n            ln.Artifact.filter(temperature_in_celsius__gt=37)            # artifacts where temperature > 37\n\n        Disambiguate duplicate feature names by querying with a `Feature` object::\n\n            feature = ln.Feature.get(name=\"my_ambig_name\", type__name=\"my_feature_type\")\n            ln.Artifact.filter(feature == \"hello\")  # instead of my_ambig_name=\"hello\"\n\n        A list dtype::\n\n            ln.Feature(\n                name=\"cell_types\",\n                dtype=list[bt.CellType],  # or list[str] for a list of strings\n         
   ).save()\n\n        A path feature::\n\n            ln.Feature(\n                name=\"image_path\",\n                dtype=\"path\",   # will be validated as `str`\n            ).save()\n\n        Restrict categories via filters::\n\n            # restrict diseases to those matching a specific ontology version\n            source = bt.Source.get(name=\"My ontology\")  # a registry for ontology versions\n            ln.Feature(\n                name=\"disease\",\n                dtype=bt.Disease,\n                cat_filters={\"source\": source},\n            ).save()\n\n            # restrict artifacts to those matching a specific schema\n            schema = ln.Schema.get(name=\"my-schema\")\n            ln.Feature(\n                name=\"valid_artifact\",\n                dtype=ln.Artifact,\n                cat_filters={\"schema\": schema},\n            ).save()\n\n        A feature accepting multiple categorical types - a union type::\n\n            ln.Feature(\n                name=\"cell_types\",\n                dtype=\"cat[bionty.Tissue.ontology_id|bionty.CellType.ontology_id]\"\n            ).save()\n\n    .. dropdown:: What is the difference between features and labels?\n\n        1. A feature qualifies what is measured, i.e., a numerical or categorical random variable\n        2. A label *is* a measured value of a categorical variable, i.e., a category\n\n        Example: When annotating a dataset that measures expression of 30k genes,\n        the gene identifiers serve as feature identifiers, and the features are expression measurements for these genes.\n        When annotating a dataset whose experiment knocked out 3 specific genes, those genes serve as labels of the dataset.\n\n        Re-shaping data can introduce ambiguity among features & labels. If this\n        happened, ask yourself what the joint measurement was: a feature\n        qualifies variables in a joint measurement. 
The canonical data matrix\n        lists jointly measured variables in the columns.\n    \"\"\"\n\n    class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta):\n        abstract = False\n        app_label = \"lamindb\"\n        if (\n            django_settings.DATABASES.get(\"default\", {}).get(\"ENGINE\")\n            == \"django.db.backends.postgresql\"\n        ):\n            triggers = [\n                pgtrigger.Trigger(\n                    name=\"update_feature_on_name_change\",\n                    operation=pgtrigger.Update,\n                    when=pgtrigger.Before,\n                    condition=pgtrigger.Condition(\"OLD.name IS DISTINCT FROM NEW.name\"),\n                    func=UPDATE_FEATURE_ON_NAME_CHANGE,\n                ),\n            ]\n        constraints = [\n            models.CheckConstraint(\n                condition=models.Q(is_type=True) | models.Q(_dtype_str__isnull=False),\n                name=\"feature_dtype_str_not_null_when_is_type_false\",\n            ),\n            # also see raw SQL constraints for `is_type` and `type` FK validity in migrations\n        ]\n\n    # Keep Django model hash/equality semantics for model identity use-cases.\n    __hash__ = SQLRecord.__hash__\n\n    _name_field: str = \"name\"\n\n    id: int = models.AutoField(primary_key=True)\n    \"\"\"Internal id, valid only in one DB instance.\"\"\"\n    uid: str = CharField(\n        editable=False, unique=True, db_index=True, max_length=12, default=base62_12\n    )\n    \"\"\"Universal id, valid across DB instances.\"\"\"\n    name: str = CharField(max_length=150, db_index=True)\n    \"\"\"Name of feature.\"\"\"\n    _dtype_str: DtypeStr | str | None = CharField(db_index=True, null=True)\n    \"\"\"The string-serialized data type (:class:`~lamindb.base.types.DtypeStr`).\n\n    Note that mutating this field currently does not trigger re-validation of existing values.\n    \"\"\"\n    type: Feature | None = ForeignKey(\n        \"self\", PROTECT, 
null=True, related_name=\"features\"\n    )\n    \"\"\"Type of feature (e.g., 'Readout', 'Metric', 'Metadata', 'ExpertAnnotation', 'ModelPrediction').\n\n    Allows to group features by type, e.g., all read outs, all metrics, etc.\n    \"\"\"\n    features: Feature\n    \"\"\"Features of this type (can only be non-empty if `is_type` is `True`).\"\"\"\n    unit: str | None = CharField(max_length=30, db_index=True, null=True)\n    \"\"\"Unit of measure, ideally SI (`m`, `s`, `kg`, etc.) or 'normalized' etc. (optional).\"\"\"\n    description: str | None = TextField(null=True)\n    \"\"\"A description.\"\"\"\n    array_rank: int = models.SmallIntegerField(default=0, db_index=True)\n    \"\"\"Rank of feature.\n\n    Number of indices of the array: 0 for scalar, 1 for vector, 2 for matrix.\n\n    Is called `.ndim` in `numpy` and `pytorch` but shouldn't be confused with\n    the dimension of the feature space.\n    \"\"\"\n    array_size: int = models.IntegerField(default=0, db_index=True)\n    \"\"\"Number of elements of the feature.\n\n    Total number of elements (product of shape components) of the array.\n\n    - A number or string (a scalar): 1 or `None`\n    - A 50-dimensional embedding: 50\n    - A 25 x 25 image: 625\n    \"\"\"\n    array_shape: list[int] | None = JSONField(default=None, db_default=None, null=True)\n    \"\"\"Shape of the feature.\n\n    - A number or string (a scalar): [1] or `None`\n    - A 50-dimensional embedding: [50]\n    - A 25 x 25 image: [25, 25]\n\n    Is stored as a list rather than a tuple because it's serialized as JSON.\n    \"\"\"\n    synonyms: str | None = TextField(null=True)\n    \"\"\"Bar-separated (|) synonyms (optional).\"\"\"\n    default_value: Any | None = JSONField(null=True, default=None)\n    \"\"\"A default value that overwrites missing values during standardization.\"\"\"\n    nullable: bool | None = BooleanField(null=True, default=None)\n    \"\"\"Whether the feature can have nullable values. 
None for type-like features.\"\"\"\n    coerce: bool | None = BooleanField(null=True, default=None)\n    \"\"\"Whether dtypes should be coerced during validation. None for type-like features.\"\"\"\n    # we define the below ManyToMany on the Feature model because it parallels\n    # how other registries (like Gene, Protein, etc.) relate to Schema\n    schemas: RelatedManager[Schema] = models.ManyToManyField(\n        \"Schema\", through=\"SchemaFeature\", related_name=\"features\"\n    )\n    \"\"\"Schemas linked to this feature.\"\"\"\n    values: RelatedManager[JsonValue]\n    \"\"\"Values for this feature.\"\"\"\n    projects: RelatedManager[Project]\n    \"\"\"Annotating projects.\"\"\"\n    ablocks: RelatedManager[FeatureBlock]\n    \"\"\"Attached blocks ← :attr:`~lamindb.FeatureBlock.feature`.\"\"\"\n\n    @overload\n    def __init__(\n        self,\n        name: str,\n        dtype: DtypeStr | ULabel | Record | Registry | list[Registry] | FieldAttr,\n        type: Feature | None = None,\n        is_type: bool = False,\n        unit: str | None = None,\n        description: str | None = None,\n        synonyms: str | None = None,\n        nullable: bool | None = None,\n        default_value: Any | None = None,\n        coerce: bool | None = None,\n        cat_filters: dict[str, str] | None = None,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(\n        self,\n        *args,\n        **kwargs,\n    ):\n        if len(args) == len(self._meta.concrete_fields):\n            super().__init__(*args, **kwargs)\n            return None\n        default_value = kwargs.pop(\"default_value\", None)\n        nullable = kwargs.pop(\"nullable\", None)\n        # Default nullable to True for non-type features\n        is_type = kwargs.get(\"is_type\", False)\n        if nullable is None and not is_type:\n            nullable = True\n        cat_filters = kwargs.pop(\"cat_filters\", None)\n        if 
\"coerce_dtype\" in kwargs:\n            warnings.warn(\n                \"`coerce_dtype` argument was renamed to `coerce` and will be removed in a future release.\",\n                DeprecationWarning,\n                stacklevel=2,\n            )\n            coerce = kwargs.pop(\"coerce_dtype\")\n        else:\n            coerce = kwargs.pop(\"coerce\", None)\n        kwargs = process_init_feature_param(args, kwargs)\n        super().__init__(*args, **kwargs)\n        self.default_value = default_value\n        self.nullable = nullable\n        self.coerce = coerce\n        dtype_str = kwargs.pop(\"_dtype_str\", None)\n        if dtype_str == \"cat\":\n            warnings.warn(\n                \"dtype `cat` is deprecated and will be removed in the future - \"\n                \"please use `ln.Record` or `ln.ULabel` instead\",\n                DeprecationWarning,\n                stacklevel=2,\n            )\n        if cat_filters:\n            if \"|\" in dtype_str:\n                raise ValidationError(\n                    f\"cat_filters are incompatible with union dtypes: '{dtype_str}'\"\n                )\n            if \"]]\" in dtype_str:\n                raise ValidationError(\n                    f\"cat_filters are incompatible with nested dtypes: '{dtype_str}'\"\n                )\n\n            # Validate filter values and SQLRecord attributes\n            for filter_key, filter_value in cat_filters.items():\n                if not filter_value or (\n                    isinstance(filter_value, str) and not filter_value.strip()\n                ):\n                    raise ValidationError(f\"Empty value in filter {filter_key}\")\n                # Check SQLRecord attributes for relation lookups\n                if isinstance(filter_value, SQLRecord) and \"__\" in filter_key:\n                    field_name = filter_key.split(\"__\", 1)[1]\n                    if not hasattr(filter_value, field_name):\n                        raise 
ValidationError(\n                            f\"SQLRecord {filter_value.__class__.__name__} has no attribute '{field_name}' in filter {filter_key}\"\n                        )\n\n            # If a SQLRecord is passed, we access its uid to apply a standard filter\n            cat_filters = {\n                f\"{key}__uid\"\n                if (\n                    is_sqlrecord := isinstance(filter, SQLRecord)\n                    and hasattr(filter, \"uid\")\n                )\n                else key: filter.uid if is_sqlrecord else filter\n                for key, filter in cat_filters.items()\n            }\n\n            fill_in = \", \".join(\n                f\"{key}='{value}'\" for (key, value) in cat_filters.items()\n            )\n            dtype_str = dtype_str.replace(\"]\", f\"[{fill_in}]]\")\n            self._dtype_str = dtype_str\n        if not self._state.adding:\n            if self._dtype_str != dtype_str:\n                raise ValidationError(\n                    f\"Feature {self.name} already exists with dtype {self._dtype_str}, you passed {dtype_str}\"\n                )\n\n    def __eq__(self, other: object) -> bool:\n        # Preserve model identity semantics only for Feature-to-Feature comparisons.\n        if isinstance(other, Feature):\n            return super().__eq__(other)\n        # Runtime returns a predicate object for query composition.\n        # Cast keeps mypy-compatible override with object.__eq__ -> bool.\n        return cast(bool, FeaturePredicate(self, \"\", other))\n\n    def __ne__(self, other: object) -> bool:\n        # Preserve model identity semantics only for Feature-to-Feature comparisons.\n        if isinstance(other, Feature):\n            return not super().__eq__(other)\n        # Runtime returns a predicate object for query composition.\n        # Cast keeps mypy-compatible override with object.__ne__ -> bool.\n        return cast(bool, FeaturePredicate(self, \"__ne\", other))\n\n    def __gt__(self, 
value: Any) -> FeaturePredicate:\n        return FeaturePredicate(self, \"__gt\", value)\n\n    def __ge__(self, value: Any) -> FeaturePredicate:\n        return FeaturePredicate(self, \"__gte\", value)\n\n    def __lt__(self, value: Any) -> FeaturePredicate:\n        return FeaturePredicate(self, \"__lt\", value)\n\n    def __le__(self, value: Any) -> FeaturePredicate:\n        return FeaturePredicate(self, \"__lte\", value)\n\n    # manually sync this docstring across all other children of HasType\n    def query_features(self) -> QuerySet:\n        \"\"\"Query features of sub types.\n\n        While `.features` retrieves the features with the current type, this method\n        also retrieves sub types and the features with sub types of the current type.\n        \"\"\"\n        return _query_relatives([self], \"features\")  # type: ignore\n\n    @classmethod\n    def from_dataframe(\n        cls, df: pd.DataFrame, field: FieldAttr | None = None, *, mute: bool = False\n    ) -> SQLRecordList:\n        \"\"\"Create Feature records for dataframe columns.\n\n        Args:\n            df: Source DataFrame to extract column information from\n            field: FieldAttr for Feature model validation, defaults to Feature.name\n            mute: Whether to mute Feature creation similar names found warnings\n        \"\"\"\n        from lamindb.models import ULabel\n\n        field = Feature.name if field is None else field\n        registry = field.field.model  # type: ignore\n        if registry != Feature:\n            raise ValueError(\"field must be a Feature FieldAttr!\")\n\n        categoricals = categoricals_from_df(df)\n        dtypes: dict[str, type | SQLRecord | FieldAttr] = {}\n        for name, col in df.items():\n            if name in categoricals:\n                dtypes[name] = ULabel\n            else:\n                dtype_str = serialize_pandas_dtype(col.dtype)\n                dtypes[name] = dtype_as_object(dtype_str)\n\n        if mute:\n            
original_verbosity = logger._verbosity\n            logger.set_verbosity(0)\n        try:\n            features = [\n                Feature(name=name, dtype=dtype) for name, dtype in dtypes.items()\n            ]  # type: ignore\n            assert len(features) == len(df.columns)  # noqa: S101\n            return SQLRecordList(features)\n        finally:\n            if mute:\n                logger.set_verbosity(original_verbosity)\n\n    @classmethod\n    @deprecated(\"from_dataframe\")\n    def from_df(\n        cls, df: pd.DataFrame, field: FieldAttr | None = None, *, mute: bool = False\n    ) -> SQLRecordList:\n        return cls.from_dataframe(df, field, mute=mute)\n\n    @classmethod\n    def from_dict(\n        cls,\n        dictionary: dict[str, Any],\n        field: FieldAttr | None = None,\n        *,\n        type: Feature | None = None,\n        mute: bool = False,\n    ) -> SQLRecordList:\n        \"\"\"Create Feature records for dictionary keys.\n\n        Args:\n            dictionary: Source dictionary to extract key information from\n            field: FieldAttr for Feature model validation, defaults to `Feature.name`\n            type: Feature type of all created features\n            mute: Whether to mute dtype inference and feature creation warnings\n        \"\"\"\n        from lamindb.models._feature_manager import infer_convert_dtype_key_value\n\n        field = Feature.name if field is None else field\n        registry = field.field.model  # type: ignore\n        if registry != Feature:\n            raise ValueError(\"field must be a Feature FieldAttr!\")\n\n        dtypes = {}\n        for key, value in dictionary.items():\n            dtype, _, message = infer_convert_dtype_key_value(key, value, mute=mute)\n            if dtype == \"cat ? str\":\n                dtype = \"str\"\n            elif dtype == \"list[cat ? 
str]\":\n                dtype = \"list[str]\"\n            dtypes[key] = dtype\n\n        if mute:\n            original_verbosity = logger._verbosity\n            logger.set_verbosity(0)\n        try:\n            features = [\n                Feature(name=key, dtype=dtype, type=type)\n                for key, dtype in dtypes.items()\n            ]  # type: ignore\n            assert len(features) == len(dictionary)  # noqa: S101\n            return SQLRecordList(features)\n        finally:\n            if mute:\n                logger.set_verbosity(original_verbosity)\n\n    def save(self, *args, **kwargs) -> Feature:\n        \"\"\"Save the feature to the instance.\"\"\"\n        super().save(*args, **kwargs)\n        return self\n\n    def with_config(self, optional: bool | None = None) -> tuple[Feature, dict]:\n        \"\"\"Pass addtional configurations to the schema.\"\"\"\n        if optional is not None:\n            return self, {\"optional\": optional}\n        return self, {}\n\n    @property\n    @deprecated(\"coerce\")\n    def coerce_dtype(self) -> bool | None:\n        \"\"\"Alias for coerce (backward compatibility).\"\"\"\n        return self.coerce\n\n    @coerce_dtype.setter\n    def coerce_dtype(self, value: bool | None) -> None:\n        self.coerce = value\n\n    @property\n    @deprecated(\"dtype_as_str\")\n    def dtype(self) -> str | None:\n        \"\"\"The `dtype` as a string.\"\"\"\n        if self._dtype_str is None:\n            return None\n        if self._dtype_str.startswith(\n            (\"cat[Record[\", \"cat[ULabel[\", \"list[cat[Record[\", \"list[cat[ULabel[\")\n        ):\n            if self._dtype_str.startswith(\"list[\"):\n                dtype_str = self._dtype_str.replace(\"list[\", \"\")[:-1]\n            else:\n                dtype_str = self._dtype_str\n            record_object = dtype_as_object(dtype_str)\n            nested_string = f\"[{record_object.name}]\"  # type: ignore\n            for t in 
record_object.query_types():  # type: ignore\n                nested_string = f\"[{t.name}{nested_string}]\"\n            return self._dtype_str.replace(f\"[{record_object.uid}]\", nested_string)  # type: ignore\n        else:\n            return self._dtype_str\n\n    @property\n    def dtype_as_str(self) -> DtypeStr | str | None:\n        \"\"\"The `dtype` as a string.\n\n        You can query by this property as if it was a string field. The query is delegated to the private `_dtype_str` field.\n\n        Is `None` if the `Feature` has `is_type=True`, otherwise a string.\n\n        Examples:\n\n            Query by `dtype_as_str`::\n\n                ln.Feature.filter(dtype_as_str=\"float\").to_dataframe()\n\n            Examples for `dtype_as_str`::\n\n                feature_float = ln.Feature(name=\"measurement\", dtype=float).save()\n                assert feature_float.dtype_as_str == \"float\"\n\n                sample_type = bt.Record(name=\"Sample\", is_type=True).save()\n                feature_sample = ln.Feature(name=\"sample\", dtype=sample_type).save()\n                assert feature_sample.dtype_as_str == \"cat[Record[12345678abcdeFGHI]]\"  # uid of type record\n\n                feature_list_float = ln.Feature(name=\"numbers\", dtype=list[float]).save()\n                assert feature_list_float.dtype_as_str == \"list[float]\"\n\n                feature_ulabel = ln.Feature(name=\"sample\", dtype=ln.ULabel).save()\n                assert feature_ulabel.dtype_as_str == \"cat[ULabel]\"\n\n                feature_record = ln.Feature(name=\"sample\", dtype=bt.CellLine).save()\n                assert feature_record.dtype_as_str == \"cat[bionty.CellLine]\"\n\n                feature_list_record = ln.Feature(name=\"cell_types\", dtype=list[bt.CellLine]).save()\n                assert feature_list_record.dtype_as_str == \"list[cat[bionty.CellLine]]\"\n        \"\"\"\n        return self._dtype_str\n\n    @property\n    def dtype_as_object(self) -> type | 
SQLRecord | FieldAttr | None:  # type: ignore\n        \"\"\"The `dtype` as an object.\n\n        Example:\n\n            For simple dtypes, returns the built-in Python type::\n\n                feature_float = ln.Feature(name=\"measurement\", dtype=float).save()\n                assert feature_float.dtype_as_object is float\n\n            For features with `Record` or `ULabel` types, returns the `Record` or `ULabel` object::\n\n                sample_type = bt.Record(name=\"Sample\", is_type=True).save()\n                feature_sample = ln.Feature(name=\"sample\", dtype=sample_type).save()\n                assert feature_sample.dtype_as_object == sample_type\n\n            For features with `Registry` types, returns the `Registry` object or a field (`DeferredAttribute`) object::\n\n                feature_cell_type = ln.Feature(name=\"cell_type_name\", dtype=bt.CellType).save()\n                assert feature_cell_type.dtype_as_object == bt.CellType\n                feature_ontology_id = ln.Feature(name=\"ontology_id\", dtype=bt.CellType.ontology_id).save()\n                assert feature_ontology_id.dtype_as_object == bt.CellType.ontology_id\n\n        \"\"\"\n        return dtype_as_object(self._dtype_str)\n\n    # we'll enable this later\n    # @property\n    # def observational_unit(self) -> Literal[\"Artifact\", \"Observation\"]:\n    #     \"\"\"Default observational unit on which the feature is measured.\n\n    #     Currently, we only make a distinction between artifact-level and observation-level features.\n\n    #     For example, a feature `\"ml_split\"` that stores `\"test\"` & `\"train\"` labels is typically defined on the artifact level.\n    #     When accessing `artifact.features.get_values([\"ml_split\"])`, you expect a single value, either `\"test\"` or `\"train\"`.\n\n    #     However, when accessing an artifact annotation with a feature that's defined on the observation-level, say `\"cell_type\"`, you expect a set of values. 
So,\n    #     `artifact.features.get_values([\"cell_type_from_expert\"])` should return a set: `{\"T cell\", \"B cell\"}`.\n\n    #     The value of `observational_unit` is currently auto-managed: if using `artifact.features.set_values()`,\n    #     it will be set to `Artifact`. In a curator, the value depends on whether it's an artifact- or observation-level slot\n    #     (e.g. `.uns` is artifact-level in `AnnData` whereas `.obs` is observation-level).\n\n    #     Note: This attribute might in the future be used to distinguish different types of observational units (e.g. single cells vs. physical samples vs. study subjects etc.).\n    #     \"\"\"\n    #     if self._expect_many:\n    #         return \"Observation\"  # this here might be replaced with the specific observational unit\n    #     else:\n    #         return \"Artifact\"\n\n\nclass JsonValue(SQLRecord, TracksRun):\n    \"\"\"JSON values for annotating artifacts and runs.\n\n    Categorical values are stored in their respective registries:\n    :class:`~lamindb.ULabel`, :class:`~bionty.CellType`, etc.\n\n    Unlike for `ULabel`, in `JsonValue`, values are grouped by features and\n    not by an ontological hierarchy.\n    \"\"\"\n\n    # we do not have a unique constraint on feature & value because it leads to hashing errors\n    # for large dictionaries: https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0000\n    # we do not hash values because we have `get_or_create` logic all over the place\n    # and also for checking whether the (feature, value) combination exists\n    # there does not seem an issue with querying for a dict-like value\n    # https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0001\n\n    _name_field: str = \"value\"\n\n    feature: Feature | None = ForeignKey(\n        Feature, CASCADE, null=True, related_name=\"values\", default=None\n    )\n    \"\"\"The dimension metadata.\"\"\"\n    value: Any = models.JSONField()\n    \"\"\"The JSON-like value.\"\"\"\n    
hash: str = CharField(max_length=HASH_LENGTH, null=True, db_index=True)\n    \"\"\"Value hash.\"\"\"\n    artifacts: Artifact\n    \"\"\"Artifacts annotated with this feature value.\"\"\"\n    runs: Run\n    \"\"\"Runs annotated with this feature value.\"\"\"\n\n    class Meta(BaseSQLRecord.Meta, TracksRun.Meta):\n        app_label = \"lamindb\"\n        unique_together = (\"feature\", \"hash\")\n\n    @classmethod\n    def get_or_create(cls, feature, value):\n        # simple values: (int, float, str, bool, datetime)\n        if not isinstance(value, dict):\n            hash = hash_string(str(value))\n        else:\n            hash = hash_dict(value)\n        try:\n            return (\n                cls.objects.create(feature=feature, value=value, hash=hash),\n                False,\n            )\n        except DjangoIntegrityError:\n            return cls.objects.get(feature=feature, hash=hash), True\n\n\ndef suggest_categorical_for_str_iterable(\n    iterable: Iterable[str], key: str = None\n) -> str:\n    import pandas as pd\n\n    c = pd.Categorical(iterable)\n    message = \"\"\n    if len(c.categories) < len(c):\n        if key != \"\":\n            key_note = f\" for feature {key}\"\n        else:\n            key_note = \"\"\n        message = f\"You have few permissible values{key_note}, consider dtype 'cat' instead of 'str'\"\n    return message\n\n\ndef categoricals_from_df(df: pd.DataFrame) -> dict:\n    \"\"\"Returns categorical columns.\"\"\"\n    from pandas.api.types import CategoricalDtype, is_string_dtype\n\n    string_cols = [col for col in df.columns if is_string_dtype(df[col])]\n    categoricals = {\n        col: df[col]\n        for col in df.columns\n        if isinstance(df[col].dtype, CategoricalDtype)\n    }\n    for key in string_cols:\n        message = suggest_categorical_for_str_iterable(df[key], key)\n        if message:\n            logger.warning(message)\n    return categoricals\n"
  },
  {
    "path": "lamindb/models/has_parents.py",
    "content": "# ruff: noqa: TC004\nfrom __future__ import annotations\n\nimport builtins\nfrom typing import TYPE_CHECKING, Literal\n\nimport lamindb_setup as ln_setup\nfrom lamin_utils import logger\n\nfrom ..errors import ValidationError\nfrom .query_set import SQLRecordList, get_default_branch_ids\nfrom .run import Run\nfrom .sqlrecord import HasType, format_field_value, get_name_field\n\nif TYPE_CHECKING:\n    from graphviz import Digraph\n\n    from lamindb.base.types import StrField\n\n    from .artifact import Artifact\n    from .collection import Collection\n    from .query_set import BasicQuerySet, QuerySet\n    from .sqlrecord import SQLRecord\n\nLAMIN_GREEN_LIGHTER = \"#10b981\"\nLAMIN_GREEN_DARKER = \"#065f46\"\nTRANSFORM_VIOLET = \"#eff2ff\"\nGREEN_FILL = \"honeydew\"\nis_run_from_ipython = getattr(builtins, \"__IPYTHON__\", False)\n\n\n# this is optimized to have fewer recursive calls\n# also len of QuerySet can be costly at times\ndef _query_relatives(\n    records: BasicQuerySet | list[HasParents],\n    attr: Literal[\"children\", \"parents\"] | str,\n) -> QuerySet:\n    branch_ids = get_default_branch_ids()\n\n    if hasattr(records, \"values_list\"):\n        model = records.model  # type: ignore\n        using_db = records.db  # type: ignore\n        frontier_ids = set(records.values_list(\"id\", flat=True))\n    else:\n        record = records[0]\n        model = record.__class__\n        using_db = record._state.db  # type: ignore\n        frontier_ids = {r.id for r in records}  # type: ignore\n\n    if attr == \"children\":\n        attr_filter = \"parents__id__in\"\n    elif attr == \"parents\":\n        attr_filter = \"children__id__in\"\n    else:\n        attr_filter = \"type__id__in\"\n\n    seen_ids = set(frontier_ids)  # copies\n    results = set()\n\n    while frontier_ids:\n        relatives_qs = model.connect(using_db).filter(\n            branch_id__in=branch_ids, **{attr_filter: frontier_ids}\n        )\n        next_ids = 
set(relatives_qs.values_list(\"id\", flat=True)) - seen_ids\n        if not next_ids:\n            break\n        results.update(next_ids)\n        seen_ids.update(next_ids)\n        frontier_ids = next_ids\n\n    return model.connect(using_db).filter(id__in=results)\n\n\ndef keep_topmost_matches(records: list[HasType] | SQLRecordList) -> SQLRecordList:\n    \"\"\"Keep only the topmost (least specific) match.\"\"\"\n    if not records:\n        return SQLRecordList([])\n\n    # Group by name\n    records_by_name: dict[str, list[HasType]] = {}\n    for record in records:\n        if record.name not in records_by_name:\n            records_by_name[record.name] = []\n        records_by_name[record.name].append(record)\n\n    # Fast path: single match per name\n    result: SQLRecordList = SQLRecordList([])\n    needs_depth_computation = {}\n\n    for name, name_records in records_by_name.items():\n        if len(name_records) == 1:\n            result.append(name_records[0])\n        else:\n            # Check if any have type_id=None (trivially topmost)\n            root_records = [r for r in name_records if r.type_id is None]\n            if len(root_records) == 1:\n                result.append(root_records[0])\n            elif len(root_records) > 1:\n                class_name = records[0].__class__.__name__\n                raise ValidationError(\n                    f\"Ambiguous match for {class_name} '{name}': found {len(root_records)} \"\n                    f\"root-level {class_name.lower()}s\"\n                )\n            else:\n                # All have type_id, need depth computation\n                needs_depth_computation[name] = name_records\n\n    # Only compute depths if necessary\n    if needs_depth_computation:\n\n        def get_depth(record):\n            current_type = record.type\n            depth = 1\n            while current_type.type_id is not None:\n                current_type = current_type.type\n                depth += 1\n          
  return depth\n\n        for name, name_records in needs_depth_computation.items():\n            records_with_depth = [(r, get_depth(r)) for r in name_records]\n            min_depth = min(depth for _, depth in records_with_depth)\n            topmost = [r for r, depth in records_with_depth if depth == min_depth]\n            class_name = records[0].__class__.__name__\n            if len(topmost) > 1:\n                raise ValidationError(\n                    f\"Ambiguous match for {class_name} '{name}': found {len(topmost)} {class_name.lower()}s \"\n                    f\"at depth {min_depth} (under types: {[r.type.name for r in topmost]})\"\n                )\n\n            result.append(topmost[0])\n\n    return result\n\n\ndef _query_ancestors_of_fk(record: SQLRecord, attr: str) -> SQLRecordList:\n    from .query_set import get_default_branch_ids\n\n    branch_ids = get_default_branch_ids()\n    ancestors = []\n\n    current = getattr(record, attr)\n    while current is not None and current.branch_id in branch_ids:\n        ancestors.append(current)\n        current = getattr(current, attr)\n\n    return SQLRecordList(ancestors)\n\n\nclass HasParents:\n    \"\"\"Base class for hierarchical registries (ontologies).\"\"\"\n\n    def view_parents(\n        self,\n        field: StrField | None = None,\n        with_children: bool = False,\n        distance: int = 5,\n    ):\n        \"\"\"View parents in an ontology.\n\n        Args:\n            field: Field to display on graph\n            with_children: Whether to also show children.\n            distance: Maximum distance still shown.\n\n        Ontological hierarchies: :class:`~lamindb.ULabel` (project & sub-project), :class:`~bionty.CellType` (cell type & subtype).\n\n        Examples:\n            >>> import bionty as bt\n            >>> bt.Tissue.from_source(name=\"subsegmental bronchus\").save()\n            >>> record = bt.Tissue.get(name=\"respiratory tube\")\n            >>> record.view_parents()\n  
          >>> tissue.view_parents(with_children=True)\n        \"\"\"\n        if field is None:\n            field = get_name_field(self)\n        if not isinstance(field, str):\n            field = field.field.name\n\n        return view_parents(\n            record=self,  # type: ignore\n            field=field,\n            with_parents=True,\n            with_children=with_children,\n            distance=distance,\n        )\n\n    def view_children(\n        self,\n        field: StrField | None = None,\n        distance: int = 5,\n    ):\n        \"\"\"View children in an ontology.\n\n        Args:\n            field: Field to display on graph\n            distance: Maximum distance still shown.\n\n        Ontological hierarchies: :class:`~lamindb.ULabel` (project & sub-project), :class:`~bionty.CellType` (cell type & subtype).\n\n        Examples:\n            >>> import bionty as bt\n            >>> bt.Tissue.from_source(name=\"subsegmental bronchus\").save()\n            >>> record = bt.Tissue.get(name=\"respiratory tube\")\n            >>> record.view_parents()\n            >>> tissue.view_parents(with_children=True)\n        \"\"\"\n        if field is None:\n            field = get_name_field(self)\n        if not isinstance(field, str):\n            field = field.field.name\n\n        return view_parents(\n            record=self,  # type: ignore\n            field=field,\n            with_parents=False,\n            with_children=True,\n            distance=distance,\n        )\n\n    def query_parents(self) -> QuerySet:\n        \"\"\"Query parents in an ontology.\"\"\"\n        return _query_relatives([self], \"parents\")  # type: ignore\n\n    def query_children(self) -> QuerySet:\n        \"\"\"Query children in an ontology.\"\"\"\n        return _query_relatives([self], \"children\")  # type: ignore\n\n\ndef view_digraph(u: Digraph):\n    from graphviz.backend import ExecutableNotFound\n\n    try:\n        if is_run_from_ipython:\n            
from IPython import get_ipython\n            from IPython.display import display\n\n            #  True if the code is running in a terminal IPython shell rather than a Jupyter Notebook or Lab environment\n            if get_ipython().__class__.__name__ == \"TerminalInteractiveShell\":\n                return u.view()\n            else:\n                # call u._repr_mimebundle_() manually so that an exception gets raised properly and not just printed by\n                # the call to display()\n                display(u._repr_mimebundle_(), raw=True)\n        else:\n            return u.view()\n    except (FileNotFoundError, RuntimeError, ExecutableNotFound):  # pragma: no cover\n        logger.error(\n            \"please install the graphviz executable on your system:\\n  - Ubuntu: `sudo\"\n            \" apt-get install graphviz`\\n  - Windows:\"\n            \" https://graphviz.org/download/#windows\\n  - Mac: `brew install graphviz`\"\n        )\n\n\ndef view_lineage(\n    data: Artifact | Collection, with_children: bool = True, return_graph: bool = False\n) -> Digraph | None:\n    \"\"\"View data lineage graph.\"\"\"\n    if ln_setup.settings.instance.is_on_hub:\n        instance_slug = ln_setup.settings.instance.slug\n        ui_url = ln_setup.settings.instance.ui_url\n        entity_slug = data.__class__.__name__.lower()\n        logger.important(\n            f\"explore at: {ui_url}/{instance_slug}/{entity_slug}/{data.uid}\"\n        )\n\n    import graphviz\n\n    df_values = _get_all_parent_runs(data)\n    if with_children:\n        df_values += _get_all_child_runs(data)\n    df_edges = _df_edges_from_runs(df_values)\n\n    def add_node(\n        record: Run | Artifact | Collection,\n        node_id: str,\n        node_label: str,\n        u: graphviz.Digraph,\n    ):\n        if isinstance(record, Run):\n            fillcolor = TRANSFORM_VIOLET\n        else:\n            fillcolor = \"white\"\n        u.node(\n            node_id,\n            label=node_label,\n            shape=\"box\",\n       
     style=\"rounded,filled\",\n            fillcolor=fillcolor,\n        )\n\n    u = graphviz.Digraph(\n        f\"{data._meta.model_name}_{data.uid}\",\n        node_attr={\n            \"fillcolor\": \"white\",\n            \"color\": \"darkgrey\",\n            \"fontname\": \"Helvetica\",\n            \"fontsize\": \"10\",\n        },\n        edge_attr={\"arrowsize\": \"0.5\"},\n    )\n\n    for _, row in df_edges.iterrows():\n        add_node(row[\"source_record\"], row[\"source\"], row[\"source_label\"], u)\n        if row[\"target_record\"] not in df_edges[\"source_record\"]:\n            add_node(row[\"target_record\"], row[\"target\"], row[\"target_label\"], u)\n\n        u.edge(row[\"source\"], row[\"target\"], color=\"dimgrey\")\n\n    u.node(\n        f\"{data._meta.model_name}_{data.uid}\",\n        label=get_record_label(data),\n        style=\"rounded,filled\",\n        fillcolor=\"white\",\n        shape=\"box\",\n    )\n\n    if return_graph:\n        return u\n    else:\n        return view_digraph(u)\n\n\ndef view_parents(\n    record: SQLRecord,\n    field: str,\n    with_parents: bool = True,\n    with_children: bool = False,\n    distance: int = 100,\n    attr_name: Literal[\"parents\", \"predecessors\"] = \"parents\",\n):\n    \"\"\"Graph of parents.\"\"\"\n    if not hasattr(record, attr_name):\n        raise NotImplementedError(\n            f\"Parents view is not supported for {record.__class__.__name__}!\"\n        )\n    import graphviz\n    import pandas as pd\n\n    df_edges = None\n    df_edges_parents = None\n    df_edges_children = None\n    if with_parents:\n        df_edges_parents = _df_edges_from_parents(\n            record=record, field=field, distance=distance, attr_name=attr_name\n        )\n    if with_children:\n        df_edges_children = _df_edges_from_parents(\n            record=record,\n            field=field,\n            distance=distance,\n            children=True,\n            attr_name=attr_name,\n        )\n 
       # Rename the columns to swap source and target\n        df_edges_children = df_edges_children.rename(\n            columns={\n                \"source\": \"temp_target\",\n                \"source_label\": \"temp_target_label\",\n                \"source_record\": \"temp_target_record\",\n                \"target\": \"source\",\n                \"target_label\": \"source_label\",\n                \"target_record\": \"source_record\",\n            }\n        )\n        df_edges_children = df_edges_children.rename(\n            columns={\n                \"temp_target\": \"target\",\n                \"temp_target_label\": \"target_label\",\n                \"temp_target_record\": \"target_record\",\n            }\n        )\n    if df_edges_parents is not None and df_edges_children is not None:\n        df_edges = pd.concat([df_edges_parents, df_edges_children]).drop_duplicates()\n    elif df_edges_parents is not None:\n        df_edges = df_edges_parents\n    elif df_edges_children is not None:\n        df_edges = df_edges_children\n    else:\n        return None\n\n    u = graphviz.Digraph(\n        record.uid,\n        node_attr={\n            \"color\": LAMIN_GREEN_DARKER,\n            \"fillcolor\": GREEN_FILL,\n            \"shape\": \"box\",\n            \"style\": \"rounded,filled\",\n            \"fontname\": \"Helvetica\",\n            \"fontsize\": \"10\",\n        },\n        edge_attr={\"arrowsize\": \"0.5\"},\n    )\n    u.node(\n        record.uid,\n        label=(get_record_label(record)),\n        fillcolor=LAMIN_GREEN_LIGHTER,\n    )\n    if df_edges is not None:\n        for _, row in df_edges.iterrows():\n            u.node(row[\"source\"], label=row[\"source_label\"])\n            u.node(row[\"target\"], label=row[\"target_label\"])\n            u.edge(row[\"source\"], row[\"target\"], color=\"dimgrey\")\n\n    view_digraph(u)\n\n\ndef _get_parents(\n    record: SQLRecord,\n    field: str,\n    distance: int,\n    children: bool = False,\n 
   attr_name: Literal[\"parents\", \"predecessors\"] = \"parents\",\n):\n    \"\"\"Recursively get parent records within a distance.\"\"\"\n    if children:\n        key = attr_name\n    else:\n        key = \"children\" if attr_name == \"parents\" else \"successors\"  # type: ignore\n\n    using_db = record._state.db\n    model = record.__class__\n    condition = f\"{key}__{field}\"\n    field_value = getattr(record, field)\n\n    results = model.connect(using_db).filter(**{condition: field_value})\n    if distance < 2:\n        return results\n\n    d = 2\n    while d < distance:\n        # this grows in the loop,\n        # i.e. children__children__name -> children__children__children__name -> ...\n        condition = f\"{key}__{condition}\"\n        records = model.connect(using_db).filter(**{condition: field_value})\n\n        try:\n            if not records.exists():\n                return results\n\n            results = results | records\n            d += 1\n        except Exception:\n            # For OperationalError:\n            # SQLite does not support joins containing more than 64 tables\n            return results\n    return results\n\n\ndef _df_edges_from_parents(\n    record: SQLRecord,\n    field: str,\n    distance: int,\n    children: bool = False,\n    attr_name: Literal[\"parents\", \"predecessors\"] = \"parents\",\n):\n    \"\"\"Construct a DataFrame of edges as the input of graphviz.Digraph.\"\"\"\n    if attr_name == \"parents\":\n        key = \"children\" if children else \"parents\"\n    else:\n        key = \"successors\" if children else \"predecessors\"\n\n    parents = _get_parents(\n        record=record,\n        field=field,\n        distance=distance,\n        children=children,\n        attr_name=attr_name,\n    )\n    using_db = record._state.db\n    all = record.__class__.objects.using(using_db)\n    records = parents | all.filter(id=record.id)\n    df = records.distinct().to_dataframe(include=[f\"{key}__id\"])\n    if 
f\"{key}__id\" not in df.columns:\n        return None\n    df_edges = df[[f\"{key}__id\"]]\n    df_edges = df_edges.explode(f\"{key}__id\")\n    df_edges.index.name = \"target\"\n    df_edges = df_edges.reset_index()\n    df_edges.dropna(axis=0, inplace=True)\n    df_edges.rename(columns={f\"{key}__id\": \"source\"}, inplace=True)\n    df_edges = df_edges.drop_duplicates()\n\n    # colons messes with the node formatting:\n    # https://graphviz.readthedocs.io/en/stable/node_ports.html\n    df_edges[\"source_record\"] = df_edges[\"source\"].apply(lambda x: all.get(id=x))\n    df_edges[\"target_record\"] = df_edges[\"target\"].apply(lambda x: all.get(id=x))\n    if record.__class__.__name__ == \"Transform\":\n        df_edges[\"source_label\"] = df_edges[\"source_record\"].apply(get_record_label)\n        df_edges[\"target_label\"] = df_edges[\"target_record\"].apply(get_record_label)\n    else:\n        df_edges[\"source_label\"] = df_edges[\"source_record\"].apply(\n            lambda x: get_record_label(x, field)\n        )\n        df_edges[\"target_label\"] = df_edges[\"target_record\"].apply(\n            lambda x: get_record_label(x, field)\n        )\n    df_edges[\"source\"] = df_edges[\"source_record\"].apply(lambda x: x.uid)\n    df_edges[\"target\"] = df_edges[\"target_record\"].apply(lambda x: x.uid)\n    return df_edges\n\n\ndef get_record_label(record: SQLRecord, field: str | None = None):\n    from .artifact import Artifact\n    from .collection import Collection\n    from .transform import Transform\n\n    if isinstance(record, (Artifact, Collection, Transform)):\n        title = (\n            record.key.replace(\"&\", \"&amp;\") if record.key is not None else record.uid\n        )\n        return rf\"<{title}>\"\n    elif isinstance(record, Run):\n        title = record.transform.key.replace(\"&\", \"&amp;\")\n        if record.entrypoint is not None:\n            title += f\": {record.entrypoint}\"\n        return (\n            
rf'<{title}<BR/><FONT COLOR=\"GREY\" POINT-SIZE=\"10\">'\n            rf\"run at {format_field_value(record.started_at)}</FONT>>\"\n        )\n    else:\n        if field is None:\n            field = get_name_field(record)\n        title = record.__getattribute__(field)\n        return rf\"<{title}>\"\n\n\ndef _get_all_parent_runs(data: Artifact | Collection) -> list:\n    \"\"\"Get all input file/collection runs recursively.\"\"\"\n    name = data._meta.model_name\n    run_inputs_outputs = []\n\n    runs = [data.run] if data.run is not None else []\n    while len(runs) > 0:\n        inputs = []\n        for r in runs:\n            inputs_run = (\n                r.__getattribute__(f\"input_{name}s\")\n                .all()\n                .filter(branch_id__in=[0, 1])\n                .to_list()\n            )\n            if name == \"artifact\":\n                inputs_run += (\n                    r.input_collections.all().filter(branch_id__in=[0, 1]).to_list()\n                )\n            outputs_run = (\n                r.__getattribute__(f\"output_{name}s\")\n                .all()\n                .filter(branch_id__in=[0, 1])\n                .to_list()\n            )\n            if name == \"artifact\":\n                outputs_run += (\n                    r.output_collections.all().filter(branch_id__in=[0, 1]).to_list()\n                )\n            # if inputs are outputs artifacts are the same, will result infinite loop\n            # so only show as outputs\n            overlap = set(inputs_run).intersection(outputs_run)\n            if overlap:\n                logger.warning(\n                    f\"The following artifacts are both inputs and outputs of Run(uid={r.uid}): {overlap}\\n   → Only showing as outputs.\"\n                )\n                inputs_run = list(set(inputs_run) - overlap)\n            if len(inputs_run) > 0:\n                run_inputs_outputs += [(inputs_run, r)]\n            if len(outputs_run) > 0:\n                
run_inputs_outputs += [(r, outputs_run)]\n            inputs += inputs_run\n        runs = [f.run for f in inputs if f.run is not None]\n    return run_inputs_outputs\n\n\ndef _get_all_child_runs(data: Artifact | Collection) -> list:\n    \"\"\"Get all output file/collection runs recursively.\"\"\"\n    name = data._meta.model_name\n    all_runs: set[Run] = set()\n    run_inputs_outputs = []\n\n    if data.run is not None:\n        runs = {f.run for f in data.run.__getattribute__(f\"output_{name}s\").all()}\n    else:\n        runs = set()\n    if name == \"artifact\" and data.run is not None:\n        runs.update(\n            {\n                f.run\n                for f in data.run.output_collections.all().filter(branch_id__in=[0, 1])\n            }\n        )\n    while runs.difference(all_runs):\n        all_runs.update(runs)\n        child_runs: set[Run] = set()\n        for r in runs:\n            inputs_run = (\n                r.__getattribute__(f\"input_{name}s\")\n                .all()\n                .filter(branch_id__in=[0, 1])\n                .to_list()\n            )\n            if name == \"artifact\":\n                inputs_run += (\n                    r.input_collections.all().filter(branch_id__in=[0, 1]).to_list()\n                )\n            run_inputs_outputs += [(inputs_run, r)]\n\n            outputs_run = (\n                r.__getattribute__(f\"output_{name}s\")\n                .all()\n                .filter(branch_id__in=[0, 1])\n                .to_list()\n            )\n            if name == \"artifact\":\n                outputs_run += (\n                    r.output_collections.all().filter(branch_id__in=[0, 1]).to_list()\n                )\n            run_inputs_outputs += [(r, outputs_run)]\n\n            child_runs.update(\n                Run.filter(  # type: ignore\n                    **{f\"input_{name}s__uid__in\": [i.uid for i in outputs_run]}\n                ).to_list()\n            )\n            # for 
artifacts, also include collections in the lineage\n            if name == \"artifact\":\n                child_runs.update(\n                    Run.filter(  # type: ignore\n                        input_collections__uid__in=[i.uid for i in outputs_run]\n                    ).to_list()\n                )\n        runs = child_runs\n    return run_inputs_outputs\n\n\ndef _df_edges_from_runs(df_values: list):\n    import pandas as pd\n\n    df = pd.DataFrame(df_values, columns=[\"source_record\", \"target_record\"])\n    df = df.explode(\"source_record\")\n    df = df.explode(\"target_record\")\n    df = df.drop_duplicates().dropna()\n    df[\"source\"] = [f\"{i._meta.model_name}_{i.uid}\" for i in df[\"source_record\"]]\n    df[\"target\"] = [f\"{i._meta.model_name}_{i.uid}\" for i in df[\"target_record\"]]\n    df[\"source_label\"] = df[\"source_record\"].apply(get_record_label)\n    df[\"target_label\"] = df[\"target_record\"].apply(get_record_label)\n    return df\n"
  },
  {
    "path": "lamindb/models/project.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, overload\n\nfrom django.core.validators import RegexValidator\nfrom django.db import models\nfrom django.db.models import CASCADE, PROTECT\n\nfrom lamindb.base.fields import (\n    BigIntegerField,\n    CharField,\n    DateField,\n    DateTimeField,\n    ForeignKey,\n    TextField,\n    URLField,\n)\nfrom lamindb.base.users import current_user_id\n\nfrom ..base.uids import base62_12\nfrom .artifact import Artifact\nfrom .can_curate import CanCurate\nfrom .collection import Collection\nfrom .feature import Feature\nfrom .has_parents import _query_relatives\nfrom .record import Record\nfrom .run import Run, TracksRun, TracksUpdates, User\nfrom .schema import Schema\nfrom .sqlrecord import BaseSQLRecord, HasType, IsLink, SQLRecord, ValidateFields\nfrom .transform import Transform\nfrom .ulabel import ULabel\n\nif TYPE_CHECKING:\n    from datetime import date as DateType\n    from datetime import datetime\n\n    from .block import Block, ProjectBlock\n    from .query_manager import RelatedManager\n    from .query_set import QuerySet\n    from .sqlrecord import Branch\n\n\nclass Reference(\n    SQLRecord, HasType, CanCurate, TracksRun, TracksUpdates, ValidateFields\n):\n    \"\"\"References such as internal studies, papers, documents, or URLs.\n\n    Example:\n\n        Create a reference object::\n\n            reference = Reference(\n                name=\"A Paper Title\",\n                abbr=\"APT\",\n                url=\"https://doi.org/10.1000/xyz123\",\n                pubmed_id=12345678,\n                doi=\"10.1000/xyz123\",\n                description=\"Good paper.\",\n                text=\"Some text I want to be searchable.\",\n                date=date(2023, 11, 21),\n            ).save()\n\n    \"\"\"\n\n    class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta):\n        abstract = False\n        app_label = \"lamindb\"\n        # also see raw SQL 
constraints for `is_type` and `type` FK validity in migrations\n\n    id: int = models.AutoField(primary_key=True)\n    \"\"\"Internal id, valid only in one DB instance.\"\"\"\n    uid: str = CharField(\n        editable=False, unique=True, max_length=12, db_index=True, default=base62_12\n    )\n    \"\"\"Universal id, valid across DB instances.\"\"\"\n    name: str = CharField(db_index=True)\n    \"\"\"Title or name of the reference document.\"\"\"\n    description: str | None = TextField(null=True)\n    \"\"\"A description.\"\"\"\n    type: Reference | None = ForeignKey(\n        \"self\", PROTECT, null=True, related_name=\"references\"\n    )\n    \"\"\"Type of reference (e.g., 'Study', 'Paper', 'Preprint') ← :attr:`~lamindb.Reference.references`.\n\n    Allows grouping references by type, e.g., internal studies vs. all papers etc.\n    \"\"\"\n    references: RelatedManager[Reference]\n    \"\"\"References of this type (can only be non-empty if `is_type` is `True`).\"\"\"\n    abbr: str | None = CharField(\n        max_length=32,\n        db_index=True,\n        null=True,\n    )\n    \"\"\"An abbreviation for the reference.\"\"\"\n    url: str | None = URLField(null=True, db_index=True)\n    \"\"\"URL linking to the reference.\"\"\"\n    pubmed_id: int | None = BigIntegerField(null=True, db_index=True)\n    \"\"\"A PubMed ID.\"\"\"\n    doi: str | None = CharField(\n        null=True,\n        db_index=True,\n        validators=[\n            RegexValidator(\n                regex=r\"^(?:https?://(?:dx\\.)?doi\\.org/|doi:|DOI:)?10\\.\\d+/.*$\",\n                message=\"Must be a DOI (e.g., 10.1000/xyz123 or https://doi.org/10.1000/xyz123)\",\n            )\n        ],\n    )\n    \"\"\"Digital Object Identifier (DOI) for the reference.\"\"\"\n    text: str | None = TextField(null=True)\n    \"\"\"Abstract or full text of the reference to make it searchable.\"\"\"\n    date: DateType | None = DateField(null=True, default=None)\n    \"\"\"Date of creation or 
publication of the reference.\"\"\"\n    artifacts: RelatedManager[Artifact] = models.ManyToManyField(\n        Artifact, through=\"ArtifactReference\", related_name=\"references\"\n    )\n    \"\"\"Annotated artifacts ← :attr:`~lamindb.Artifact.references`.\"\"\"\n    transforms: RelatedManager[Transform] = models.ManyToManyField(\n        Transform, through=\"TransformReference\", related_name=\"references\"\n    )\n    \"\"\"Annotated transforms ← :attr:`~lamindb.Transform.references`.\"\"\"\n    collections: RelatedManager[Collection] = models.ManyToManyField(\n        Collection, through=\"CollectionReference\", related_name=\"references\"\n    )\n    \"\"\"Annotated collections ← :attr:`~lamindb.Collection.references`.\"\"\"\n    linked_in_records: RelatedManager[Record] = models.ManyToManyField(\n        Record, through=\"RecordReference\", related_name=\"linked_references\"\n    )\n    \"\"\"Linked in records ← :attr:`~lamindb.Record.linked_references`.\"\"\"\n    records: RelatedManager[Record] = models.ManyToManyField(\n        Record, through=\"ReferenceRecord\", related_name=\"references\"\n    )\n    \"\"\"Annotated records ← :attr:`~lamindb.Record.references`.\"\"\"\n    projects: RelatedManager[Project]\n    \"\"\"Projects that annotate this reference ← :attr:`~lamindb.Project.references`.\"\"\"\n\n    @overload\n    def __init__(\n        self,\n        name: str,\n        type: Reference | None = None,\n        is_type: bool = False,\n        abbr: str | None = None,\n        url: str | None = None,\n        pubmed_id: int | None = None,\n        doi: str | None = None,\n        description: str | None = None,\n        text: str | None = None,\n        date: DateType | None = None,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n\n    def query_references(self) -> QuerySet:\n        \"\"\"Query references of sub 
types.\n\n        While `.references` retrieves the references with the current type, this method\n        also retrieves sub types and the references with sub types of the current type.\n        \"\"\"\n        return _query_relatives([self], \"references\")  # type: ignore\n\n\nclass Project(SQLRecord, HasType, CanCurate, TracksRun, TracksUpdates, ValidateFields):\n    \"\"\"Projects to label artifacts, transforms, records, and runs.\n\n    Example:\n\n        Create a project and annotate an artifact with it::\n\n            project = Project(\n                name=\"My Project Name\",\n                abbr=\"MPN\",\n                url=\"https://example.com/my_project\",\n            ).save()\n            artifact.projects.add(project)  # <-- labels the artifact with the project\n            ln.track(project=project)       # <-- automatically labels entities during the run\n\n    \"\"\"\n\n    class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta):\n        abstract = False\n        app_label = \"lamindb\"\n        # also see raw SQL constraints for `is_type` and `type` FK validity in migrations\n\n    id: int = models.AutoField(primary_key=True)\n    \"\"\"Internal id, valid only in one DB instance.\"\"\"\n    uid: str = CharField(\n        editable=False, unique=True, max_length=12, db_index=True, default=base62_12\n    )\n    \"\"\"Universal id, valid across DB instances.\"\"\"\n    name: str = CharField(db_index=True)\n    \"\"\"Title or name of the Project.\"\"\"\n    description: str | None = TextField(null=True)\n    \"\"\"A description.\"\"\"\n    type: Project | None = ForeignKey(\n        \"self\", PROTECT, null=True, related_name=\"projects\"\n    )\n    \"\"\"Type of project (e.g., 'Program', 'Project', 'GithubIssue', 'Task') ← :attr:`~lamindb.Project.projects`.\"\"\"\n    projects: RelatedManager[Project]\n    \"\"\"Projects of this type (can only be non-empty if `is_type` is `True`).\"\"\"\n    abbr: str | None = CharField(max_length=32, 
db_index=True, null=True)\n    \"\"\"An abbreviation.\"\"\"\n    url: str | None = URLField(max_length=255, null=True, default=None)\n    \"\"\"A URL.\"\"\"\n    start_date: DateType | None = DateField(null=True, default=None)\n    \"\"\"Date of start of the project.\"\"\"\n    end_date: DateType | None = DateField(null=True, default=None)\n    \"\"\"Date of end of the project.\"\"\"\n    parents: RelatedManager[Project] = models.ManyToManyField(\n        \"self\", symmetrical=False, related_name=\"children\"\n    )\n    \"\"\"Parent projects, the super-projects owning this project ← :attr:`~lamindb.Project.children`.\"\"\"\n    children: RelatedManager[Project]\n    \"\"\"Child projects, the sub-projects owned by this project.\n\n    Reverse accessor for `.parents`.\n    \"\"\"\n    predecessors: RelatedManager[Project] = models.ManyToManyField(\n        \"self\", symmetrical=False, related_name=\"successors\"\n    )\n    \"\"\"The preceding projects required by this project ← :attr:`~lamindb.Project.successors`.\"\"\"\n    successors: RelatedManager[Project]\n    \"\"\"The succeeding projects requiring this project.\n\n    Reverse accessor for `.predecessors`.\n    \"\"\"\n    artifacts: RelatedManager[Artifact] = models.ManyToManyField(\n        Artifact, through=\"ArtifactProject\", related_name=\"projects\"\n    )\n    \"\"\"Annotated artifacts ← :attr:`~lamindb.Artifact.projects`.\"\"\"\n    transforms: RelatedManager[Transform] = models.ManyToManyField(\n        Transform, through=\"TransformProject\", related_name=\"projects\"\n    )\n    \"\"\"Annotated transforms ← :attr:`~lamindb.Transform.projects`.\"\"\"\n    runs: RelatedManager[Run] = models.ManyToManyField(\n        Run, through=\"RunProject\", related_name=\"projects\"\n    )\n    \"\"\"Annotated runs ← :attr:`~lamindb.Run.projects`.\"\"\"\n    ulabels: RelatedManager[ULabel] = models.ManyToManyField(\n        ULabel, through=\"ULabelProject\", related_name=\"projects\"\n    )\n    
\"\"\"Annotated ulabels ← :attr:`~lamindb.ULabel.projects`.\"\"\"\n    features: RelatedManager[Feature] = models.ManyToManyField(\n        Feature, through=\"FeatureProject\", related_name=\"projects\"\n    )\n    \"\"\"Annotated features ← :attr:`~lamindb.Feature.projects`.\"\"\"\n    schemas: RelatedManager[Schema] = models.ManyToManyField(\n        Schema, through=\"SchemaProject\", related_name=\"projects\"\n    )\n    \"\"\"Annotated schemas ← :attr:`~lamindb.Schema.projects`.\"\"\"\n    linked_in_records: RelatedManager[Record] = models.ManyToManyField(\n        Record, through=\"RecordProject\", related_name=\"linked_projects\"\n    )\n    \"\"\"Linked in records ← :attr:`~lamindb.Record.linked_projects`.\"\"\"\n    records: RelatedManager[Record] = models.ManyToManyField(\n        Record, through=\"ProjectRecord\", related_name=\"projects\"\n    )\n    \"\"\"Annotated records ← :attr:`~lamindb.Record.projects`.\"\"\"\n    collections: RelatedManager[Collection] = models.ManyToManyField(\n        Collection, through=\"CollectionProject\", related_name=\"projects\"\n    )\n    \"\"\"Annotated collections ← :attr:`~lamindb.Collection.projects`.\"\"\"\n    references: RelatedManager[Reference] = models.ManyToManyField(\n        \"Reference\", related_name=\"projects\"\n    )\n    \"\"\"Annotated references ← :attr:`~lamindb.Reference.projects`.\"\"\"\n    blocks: RelatedManager[Block] = models.ManyToManyField(\n        \"Block\", through=\"BlockProject\", related_name=\"projects\"\n    )\n    \"\"\"Annotated blocks ← :attr:`~lamindb.Block.projects`.\"\"\"\n    users: RelatedManager[User] = models.ManyToManyField(\n        \"User\",\n        through=\"ProjectUser\",\n        related_name=\"projects\",\n    )\n    \"\"\"Users participating in this project ← :attr:`~lamindb.ProjectUser.user`.\"\"\"\n    branches: RelatedManager[Branch]\n    \"\"\"Annotated branches ← :attr:`~lamindb.BranchProject.project`.\"\"\"\n    _status_code: int = 
models.SmallIntegerField(default=0, db_default=0, db_index=True)\n    \"\"\"Status code.\"\"\"\n    ablocks: RelatedManager[ProjectBlock]\n    \"\"\"Attached blocks ← :attr:`~lamindb.ProjectBlock.project`.\"\"\"\n\n    @overload\n    def __init__(\n        self,\n        name: str,\n        type: Project | None = None,\n        is_type: bool = False,\n        abbr: str | None = None,\n        url: str | None = None,\n        start_date: DateType | None = None,\n        end_date: DateType | None = None,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n\n    def query_projects(self) -> QuerySet:\n        \"\"\"Query projects of sub types.\n\n        While `.projects` retrieves the projects with the current type, this method\n        also retrieves sub types and the projects with sub types of the current type.\n        \"\"\"\n        return _query_relatives([self], \"projects\")  # type: ignore\n\n\nclass ArtifactProject(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name=\"links_project\")\n    project: Project = ForeignKey(Project, PROTECT, related_name=\"links_artifact\")\n    feature: Feature | None = ForeignKey(\n        Feature,\n        PROTECT,\n        null=True,\n        default=None,\n        related_name=\"links_artifactproject\",\n    )\n\n    class Meta:\n        app_label = \"lamindb\"\n        # can have the same label linked to the same artifact if the feature is different\n        unique_together = (\"artifact\", \"project\", \"feature\")\n\n\nclass RunProject(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    run: Run = ForeignKey(Run, CASCADE, related_name=\"links_project\")\n    project: Project = ForeignKey(Project, PROTECT, related_name=\"links_run\")\n    created_at: datetime = 
DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    \"\"\"Time of creation of record.\"\"\"\n    created_by: User = ForeignKey(\n        \"lamindb.User\",\n        PROTECT,\n        editable=False,\n        default=current_user_id,\n        related_name=\"+\",\n    )\n    \"\"\"Creator of record.\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"run\", \"project\")\n\n\nclass BranchProject(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    branch: Branch = ForeignKey(\"Branch\", CASCADE, related_name=\"links_project\")\n    project: Project = ForeignKey(Project, PROTECT, related_name=\"links_branch\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"branch\", \"project\")\n\n\nclass ProjectUser(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    project: Project = ForeignKey(Project, CASCADE, related_name=\"links_user\")\n    user: User = ForeignKey(\"User\", PROTECT, related_name=\"links_project\")\n    role: str = CharField(max_length=32, db_index=True)\n    \"\"\"Role (e.g. 
\"responsible\", \"viewer\").\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"project\", \"user\", \"role\")\n\n\nclass TransformProject(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    transform: Transform = ForeignKey(Transform, CASCADE, related_name=\"links_project\")\n    project: Project = ForeignKey(Project, PROTECT, related_name=\"links_transform\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"transform\", \"project\")\n\n\nclass CollectionProject(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    collection: Collection = ForeignKey(\n        Collection, CASCADE, related_name=\"links_project\"\n    )\n    project: Project = ForeignKey(Project, PROTECT, related_name=\"links_collection\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"collection\", \"project\")\n\n\nclass ULabelProject(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    ulabel: ULabel = ForeignKey(ULabel, CASCADE, related_name=\"links_project\")\n    project: Project = ForeignKey(Project, PROTECT, related_name=\"links_ulabel\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"ulabel\", \"project\")\n\n\nclass FeatureProject(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    feature: Feature = ForeignKey(Feature, CASCADE, related_name=\"links_project\")\n    project: Project = ForeignKey(Project, PROTECT, related_name=\"links_feature\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"feature\", \"project\")\n\n\nclass SchemaProject(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    schema: Schema = ForeignKey(Schema, CASCADE, related_name=\"links_project\")\n    project: Project = ForeignKey(Project, PROTECT, 
related_name=\"links_schema\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"schema\", \"project\")\n\n\n# for annotation of records with references, RecordReference is for storing reference values\nclass ReferenceRecord(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    reference: Reference = ForeignKey(Reference, PROTECT, related_name=\"links_record\")\n    feature: Feature | None = ForeignKey(\n        Feature,\n        PROTECT,\n        null=True,\n        default=None,\n        related_name=\"links_referencerecord\",\n    )\n    record: Record = ForeignKey(Record, CASCADE, related_name=\"links_reference\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"reference\", \"feature\", \"record\")\n\n\nclass RecordReference(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    record: Record = ForeignKey(Record, CASCADE, related_name=\"values_reference\")\n    feature: Feature = ForeignKey(\n        Feature, PROTECT, related_name=\"links_recordreference\"\n    )\n    value: Reference = ForeignKey(Reference, PROTECT, related_name=\"links_in_record\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"record\", \"feature\", \"value\")\n\n\n# for annotation of records with projects, RecordProject is for storing project values\nclass ProjectRecord(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    project: Project = ForeignKey(Project, PROTECT, related_name=\"links_record\")\n    feature: Feature | None = ForeignKey(\n        Feature,\n        PROTECT,\n        null=True,\n        default=None,\n        related_name=\"links_projectrecord\",\n    )\n    record: Record = ForeignKey(Record, CASCADE, related_name=\"links_project\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"project\", \"feature\", \"record\")\n\n\nclass 
RecordProject(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    record: Record = ForeignKey(Record, CASCADE, related_name=\"values_project\")\n    feature: Feature = ForeignKey(Feature, PROTECT, related_name=\"links_recordproject\")\n    value: Project = ForeignKey(Project, PROTECT, related_name=\"links_in_record\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"record\", \"feature\", \"value\")\n\n\nclass BlockProject(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    block = ForeignKey(\"Block\", CASCADE, related_name=\"links_project\")\n    project: Project = ForeignKey(Project, PROTECT, related_name=\"links_block\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"block\", \"project\")\n\n\nclass ArtifactReference(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name=\"links_reference\")\n    reference: Reference = ForeignKey(Reference, PROTECT, related_name=\"links_artifact\")\n    feature: Feature | None = ForeignKey(\n        Feature,\n        PROTECT,\n        null=True,\n        default=None,\n        related_name=\"links_artifactreference\",\n    )\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"artifact\", \"reference\", \"feature\")\n\n\nclass TransformReference(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    transform: Transform = ForeignKey(\n        Transform, CASCADE, related_name=\"links_reference\"\n    )\n    reference: Reference = ForeignKey(\n        Reference, PROTECT, related_name=\"links_transform\"\n    )\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"transform\", \"reference\")\n\n\nclass CollectionReference(BaseSQLRecord, IsLink, TracksRun):\n    id: int = 
models.BigAutoField(primary_key=True)\n    collection: Collection = ForeignKey(\n        Collection, CASCADE, related_name=\"links_reference\"\n    )\n    reference: Reference = ForeignKey(\n        Reference, PROTECT, related_name=\"links_collection\"\n    )\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"collection\", \"reference\")\n"
  },
  {
    "path": "lamindb/models/query_manager.py",
    "content": "from __future__ import annotations\n\nimport re\nfrom functools import reduce\nfrom typing import TYPE_CHECKING, Literal, NamedTuple\n\nfrom django.db.models import (\n    IntegerField,\n    Manager,\n    Q,\n    QuerySet,\n    TextField,\n    Value,\n)\nfrom django.db.models.functions import Cast, Coalesce\nfrom django.db.models.lookups import (\n    Contains,\n    Exact,\n    IContains,\n    IExact,\n    IRegex,\n    IStartsWith,\n    Regex,\n    StartsWith,\n)\nfrom lamin_utils._lookup import Lookup\nfrom lamindb_setup.core import deprecated\nfrom lamindb_setup.core._docs import doc_args\n\nif TYPE_CHECKING:\n    from ..base.types import StrField\n\n\ndef _search(\n    cls,\n    string: str,\n    *,\n    field: StrField | list[StrField] | None = None,\n    limit: int | None = 20,\n    case_sensitive: bool = False,\n    truncate_string: bool = False,\n) -> QuerySet:\n    \"\"\"Search.\n\n    Args:\n        string: The input string to match against the field ontology values.\n        field: The field or fields to search. Search all string fields by default.\n        limit: Maximum amount of top results to return.\n        case_sensitive: Whether the match is case sensitive.\n\n    Returns:\n        A sorted `DataFrame` of search results with a score in column `score`.\n        If `return_queryset` is `True`.  `QuerySet`.\n\n    See Also:\n        :meth:`~lamindb.models.SQLRecord.filter`\n        :meth:`~lamindb.models.SQLRecord.lookup`\n\n    Examples:\n\n        ::\n\n            records = ln.ULabel.from_values([\"Label1\", \"Label2\", \"Label3\"]).save()\n            ln.ULabel.search(\"Label2\")\n    \"\"\"\n    if string is None:\n        raise ValueError(\"Cannot search for None value! 
Please pass a valid string.\")\n\n    input_queryset = (\n        cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.objects.all()\n    )\n    registry = input_queryset.model\n    name_field = getattr(registry, \"_name_field\", \"name\")\n    if field is None:\n        fields = [\n            field.name\n            for field in registry._meta.fields\n            if field.get_internal_type() in {\"CharField\", \"TextField\"}\n        ]\n    else:\n        if not isinstance(field, list):\n            fields_input = [field]\n        else:\n            fields_input = field\n        fields = []\n        for field in fields_input:\n            if not isinstance(field, str):\n                try:\n                    fields.append(field.field.name)\n                except AttributeError as error:\n                    raise TypeError(\n                        \"Please pass a SQLRecord string field, e.g., `CellType.name`!\"\n                    ) from error\n            else:\n                fields.append(field)\n\n    if truncate_string:\n        if (len_string := len(string)) > 5:\n            n_80_pct = int(len_string * 0.8)\n            string = string[:n_80_pct]\n\n    string = string.strip()\n    string_escape = re.escape(string)\n\n    exact_lookup = Exact if case_sensitive else IExact\n    regex_lookup = Regex if case_sensitive else IRegex\n    contains_lookup = Contains if case_sensitive else IContains\n\n    ranks = []\n    contains_filters = []\n    for field in fields:\n        field_expr = Coalesce(\n            Cast(field, output_field=TextField()),\n            Value(\"\"),\n            output_field=TextField(),\n        )\n        # exact rank\n        exact_expr = exact_lookup(field_expr, string)\n        exact_rank = Cast(exact_expr, output_field=IntegerField()) * 200\n        ranks.append(exact_rank)\n        # exact synonym\n        synonym_expr = regex_lookup(field_expr, rf\"(?:^|.*\\|){string_escape}(?:\\|.*|$)\")\n        synonym_rank = 
Cast(synonym_expr, output_field=IntegerField()) * 200\n        ranks.append(synonym_rank)\n        # match as sub-phrase\n        sub_expr = regex_lookup(\n            field_expr, rf\"(?:^|.*[ \\|\\.,;:]){string_escape}(?:[ \\|\\.,;:].*|$)\"\n        )\n        sub_rank = Cast(sub_expr, output_field=IntegerField()) * 10\n        ranks.append(sub_rank)\n        # startswith and avoid matching string with \" \" on the right\n        # mostly for truncated\n        startswith_expr = regex_lookup(\n            field_expr, rf\"(?:^|.*\\|){string_escape}[^ ]*(?:\\|.*|$)\"\n        )\n        startswith_rank = Cast(startswith_expr, output_field=IntegerField()) * 8\n        ranks.append(startswith_rank)\n        # match as sub-phrase from the left, mostly for truncated\n        right_expr = regex_lookup(field_expr, rf\"(?:^|.*[ \\|]){string_escape}.*\")\n        right_rank = Cast(right_expr, output_field=IntegerField()) * 2\n        ranks.append(right_rank)\n        # match as sub-phrase from the right\n        left_expr = regex_lookup(field_expr, rf\".*{string_escape}(?:$|[ \\|\\.,;:].*)\")\n        left_rank = Cast(left_expr, output_field=IntegerField()) * 2\n        ranks.append(left_rank)\n        # simple contains filter\n        contains_expr = contains_lookup(field_expr, string)\n        contains_filter = Q(contains_expr)\n        contains_filters.append(contains_filter)\n        # also rank by contains\n        contains_rank = Cast(contains_expr, output_field=IntegerField())\n        ranks.append(contains_rank)\n        # additional rule for truncated strings\n        # weight matches from the beginning of the string higher\n        # sometimes whole words get truncated and startswith_expr is not enough\n        if truncate_string and field == name_field:\n            startswith_lookup = StartsWith if case_sensitive else IStartsWith\n            name_startswith_expr = startswith_lookup(field_expr, string)\n            name_startswith_rank = (\n                
Cast(name_startswith_expr, output_field=IntegerField()) * 2\n            )\n            ranks.append(name_startswith_rank)\n\n    ranked_queryset = (\n        input_queryset.filter(reduce(lambda a, b: a | b, contains_filters))\n        .alias(rank=sum(ranks))\n        .order_by(\"-rank\")\n    )\n\n    return ranked_queryset[:limit]\n\n\ndef _lookup(\n    cls,\n    field: StrField | None = None,\n    return_field: StrField | None = None,\n    using_key: str | None = None,\n    keep: Literal[\"first\", \"last\", False] = \"first\",\n) -> NamedTuple:\n    \"\"\"Return an auto-complete object for a field.\n\n    Args:\n        field: The field to look up the values for. Defaults to first string field.\n        return_field: The field to return. If `None`, returns the whole record.\n        keep: When multiple records are found for a lookup, how to return the records.\n            - `\"first\"`: return the first record.\n            - `\"last\"`: return the last record.\n            - `False`: return all records.\n\n    Returns:\n        A `NamedTuple` of lookup information of the field values with a\n        dictionary converter.\n\n    See Also:\n        :meth:`~lamindb.models.SQLRecord.search`\n\n    Examples:\n\n        Lookup via auto-complete on `.`::\n\n            import bionty as bt\n            bt.Gene.from_source(symbol=\"ADGB-DT\").save()\n            lookup = bt.Gene.lookup()\n            lookup.adgb_dt\n\n        Look up via auto-complete in dictionary::\n\n            lookup_dict = lookup.dict()\n            lookup_dict['ADGB-DT']\n\n        Look up via a specific field::\n\n            lookup_by_ensembl_id = bt.Gene.lookup(field=\"ensembl_gene_id\")\n            lookup_by_ensembl_id.ensg00000002745\n\n        Return a specific field value instead of the full record::\n\n            lookup_return_symbols = bt.Gene.lookup(field=\"ensembl_gene_id\", return_field=\"symbol\")\n    \"\"\"\n    from .sqlrecord import get_name_field\n\n    queryset = cls.all() if 
isinstance(cls, (QuerySet, Manager)) else cls.objects.all()\n    field = get_name_field(registry=queryset.model, field=field)\n\n    return Lookup(\n        records=queryset,\n        values=[i.get(field) for i in queryset.values()],\n        tuple_name=cls.__class__.__name__,\n        prefix=\"ln\",\n        keep=keep,\n    ).lookup(\n        return_field=(\n            get_name_field(registry=queryset.model, field=return_field)\n            if return_field is not None\n            else None\n        )\n    )\n\n\n# this is the default (._default_manager and ._base_manager) for lamindb models\nclass QueryManager(Manager):\n    \"\"\"Manage queries through fields.\n\n    See Also:\n\n        :class:`lamindb.models.QuerySet`\n\n        `django Manager <https://docs.djangoproject.com/en/4.2/topics/db/managers/>`__\n\n    Examples:\n\n        Populate the `.parents` ManyToMany relationship (a `QueryManager`)::\n\n            ln.ULabel.from_values([\"Label1\", \"Label2\", \"Label3\"]).save()\n            labels = ln.ULabel.filter(name__icontains=\"label\")\n            label1 = ln.ULabel.get(name=\"Label1\")\n            label1.parents.set(labels)\n\n        Convert all linked parents to a `DataFrame`::\n\n            label1.parents.to_dataframe()\n    \"\"\"\n\n    def to_list(self, field: str | None = None):\n        \"\"\"Populate a list.\"\"\"\n        if field is None:\n            return list(self.all())\n        else:\n            return list(self.values_list(field, flat=True))\n\n    def to_dataframe(self, **kwargs):\n        \"\"\"Convert to DataFrame.\n\n        For `**kwargs`, see :meth:`lamindb.models.QuerySet.to_dataframe`.\n        \"\"\"\n        return self.all().to_dataframe(**kwargs)\n\n    @deprecated(new_name=\"to_dataframe\")\n    def df(self, **kwargs):\n        return self.to_dataframe(**kwargs)\n\n    @doc_args(_search.__doc__)\n    def search(self, string: str, **kwargs):\n        \"\"\"{}\"\"\"  # noqa: D415\n        return 
_search(cls=self.all(), string=string, **kwargs)\n\n    @doc_args(_lookup.__doc__)\n    def lookup(self, field: StrField | None = None, **kwargs) -> NamedTuple:\n        \"\"\"{}\"\"\"  # noqa: D415\n        return _lookup(cls=self.all(), field=field, **kwargs)\n\n    def get_queryset(self):\n        from .query_set import BasicQuerySet\n\n        # QueryManager returns BasicQuerySet because it is problematic to redefine .filter and .get\n        # for a query set used by the default manager\n        return BasicQuerySet(model=self.model, using=self._db, hints=self._hints)\n\n\n# below is just for typing / docs\n# Django achieves the same thing with a dynamically generated class\nclass RelatedManager(QueryManager):\n    \"\"\"Manager for many-to-many and reverse foreign key relationships.\n\n    Provides relationship manipulation methods.\n\n    See Also:\n        :class:`lamindb.models.QueryManager`\n\n    Examples:\n\n        Populate the `.parents` ManyToMany relationship (a `RelatedManager`)::\n\n            ln.ULabel.from_values([\"Label1\", \"Label2\", \"Label3\"]).save()\n            labels = ln.ULabel.filter(name__icontains=\"label\")\n            label1 = ln.ULabel.get(name=\"Label1\")\n            label1.parents.set(labels)\n\n        Convert all linked parents to a `DataFrame`::\n\n            label1.parents.to_dataframe()\n\n        Remove a parent label::\n\n            label1.parents.remove(label2)\n\n        Clear all parent labels::\n\n            label1.parents.clear()\n\n    \"\"\"\n\n    def add(self, *objs, bulk: bool = True) -> None:\n        \"\"\"Add objects to the relationship.\"\"\"\n        ...\n\n    def set(self, objs, *, bulk: bool = True, clear: bool = False) -> None:\n        \"\"\"Set the relationship to the specified objects.\"\"\"\n        ...\n\n    def remove(self, *objs, bulk: bool = True) -> None:\n        \"\"\"Remove objects from the relationship.\"\"\"\n        ...\n\n    def clear(self) -> None:\n        \"\"\"Remove all 
objects from the relationship.\"\"\"\n        ...\n"
  },
  {
    "path": "lamindb/models/query_set.py",
    "content": "from __future__ import annotations\n\nimport ast\nimport re\nimport warnings\nfrom collections import UserList, defaultdict\nfrom collections.abc import Iterable\nfrom collections.abc import Iterable as IterableType\nfrom importlib import import_module\nfrom typing import TYPE_CHECKING, Any, Generic, NamedTuple, TypeVar, final\n\nimport lamindb_setup as ln_setup\nfrom django.core.exceptions import FieldError\nfrom django.db import models\nfrom django.db.models import (\n    F,\n    FilteredRelation,\n    ForeignKey,\n    ManyToManyField,\n    Q,\n    Subquery,\n)\nfrom django.db.models.fields.related import ForeignObjectRel\nfrom lamin_utils import logger\nfrom lamindb_setup import settings as setup_settings\nfrom lamindb_setup.core import deprecated\nfrom lamindb_setup.core._docs import doc_args\n\nfrom ..base.types import BRANCH_STATUS_TO_CODE, RUN_STATUS_TO_CODE\nfrom ..errors import DoesNotExist, MultipleResultsFound\nfrom ._is_versioned import IsVersioned, _adjust_is_latest_when_deleting_is_versioned\nfrom .can_curate import CanCurate, _inspect, _standardize, _validate\nfrom .query_manager import _lookup, _search\nfrom .sqlrecord import Registry, SQLRecord\n\nif TYPE_CHECKING:\n    import pandas as pd\n    from bionty.models import (\n        CellLine,\n        CellMarker,\n        CellType,\n        DevelopmentalStage,\n        Disease,\n        Ethnicity,\n        ExperimentalFactor,\n        Gene,\n        Organism,\n        Pathway,\n        Phenotype,\n        Protein,\n        Tissue,\n    )\n    from pertdb.models import (\n        Biologic,\n        CombinationPerturbation,\n        Compound,\n        CompoundPerturbation,\n        EnvironmentalPerturbation,\n        GeneticPerturbation,\n        PerturbationTarget,\n    )\n\n    from lamindb.base.types import ListLike, StrField\n    from lamindb.models import (\n        Artifact,\n        Branch,\n        Collection,\n        Feature,\n        Project,\n        Record,\n        
Reference,\n        Run,\n        Schema,\n        Space,\n        Storage,\n        Transform,\n        ULabel,\n        User,\n    )\n\nT = TypeVar(\"T\")\n\n\ndef get_keys_from_df(data: list, registry: SQLRecord) -> list[str]:\n    if len(data) > 0:\n        if isinstance(data[0], dict):\n            keys = list(data[0].keys())\n        else:\n            keys = list(data[0].__dict__.keys())\n            if \"_state\" in keys:\n                keys.remove(\"_state\")\n    else:\n        keys = [\n            field.name\n            for field in registry._meta.fields\n            if not isinstance(field, models.ForeignKey)\n        ]\n        keys += [\n            f\"{field.name}_id\"\n            for field in registry._meta.fields\n            if isinstance(field, models.ForeignKey)\n        ]\n    return keys\n\n\ndef get_default_branch_ids(branch: Branch | None = None) -> list[int]:\n    \"\"\"Return branch IDs to include in default queries.\n\n    By default, queries include records on the main branch (branch_id=1) but exclude trashed (branch_id=-1)\n    and archived records (branch_id=0). 
This matches behavior of familiar tools like GitHub, Slack, and\n    email clients.\n\n    If a user switches to another branch via `lamin switch branch`, the main branch will still be included.\n\n    Returns:\n        List containing the default branch and current branch if different.\n    \"\"\"\n    if branch is None:\n        branch_id = setup_settings.branch.id\n    else:\n        branch_id = branch.id\n    branch_ids = [branch_id]\n    if branch_id != 1:  # add the main branch by default\n        branch_ids.append(1)\n    return branch_ids\n\n\ndef one_helper(\n    self: QuerySet | SQLRecordList,\n    does_not_exist_msg: str | None = None,\n    raise_doesnotexist: bool = True,\n    not_exists: bool | None = None,\n    raise_multipleresultsfound: bool = True,\n):\n    if not_exists is None:\n        if isinstance(self, SQLRecordList):\n            not_exists = len(self) == 0\n        else:\n            not_exists = not self.exists()  # type: ignore\n    if not_exists:\n        if raise_doesnotexist:\n            raise DoesNotExist(does_not_exist_msg)\n        else:\n            return None\n    elif len(self) > 1:\n        if raise_multipleresultsfound:\n            raise MultipleResultsFound(self)\n        else:\n            return self[0]\n    else:\n        return self[0]\n\n\ndef get_backward_compat_filter_kwargs(queryset, expressions):\n    from lamindb.models import (\n        Artifact,\n        Branch,\n        Feature,\n        Project,\n        Run,\n    )\n\n    if issubclass(queryset.model, IsVersioned):\n        name_mappings = {\n            \"version\": \"version_tag\",\n        }\n    else:\n        name_mappings = {}\n\n    if queryset.model is Artifact:\n        name_mappings.update(\n            {\n                \"transform\": \"run__transform\",\n                \"feature_sets\": \"schemas\",\n            }\n        )\n    if queryset.model is Feature:\n        name_mappings.update(\n            {\n                \"dtype\": 
\"_dtype_str\",\n                \"dtype_as_str\": \"_dtype_str\",\n            }\n        )\n    if queryset.model in {Run, Branch, Project}:\n        name_mappings.update(\n            {\n                \"status\": \"_status_code\",\n            }\n        )\n\n    # If no mappings to apply, return expressions as-is\n    if not name_mappings:\n        return expressions\n    was_list = False\n    if isinstance(expressions, list):\n        was_list = True\n        expressions = {field: True for field in expressions}\n    mapped = {}\n    status_mapping = None\n    if queryset.model is Run:\n        status_mapping = RUN_STATUS_TO_CODE\n    elif queryset.model is Branch:\n        status_mapping = BRANCH_STATUS_TO_CODE\n\n    def _map_status_value(value):\n        if status_mapping is None:\n            return value\n        if isinstance(value, str):\n            if value not in status_mapping:\n                expected = \", \".join(f\"'{status}'\" for status in status_mapping)\n                raise ValueError(\n                    f\"Invalid {queryset.model.__name__} status '{value}'. \"\n                    f\"Expected one of: {expected}.\"\n                )\n            return status_mapping[value]\n        if isinstance(value, IterableType) and not isinstance(value, str):\n            return [\n                status_mapping[v] if isinstance(v, str) and v in status_mapping else v\n                for v in value\n            ]\n        return value\n\n    for field, value in expressions.items():\n        parts = field.split(\"__\")\n        if parts[0] in name_mappings:\n            # Issue deprecation warnings\n            if queryset.model is Artifact and parts[0] == \"feature_sets\":\n                warnings.warn(\n                    \"Querying Artifact by `feature_sets` is deprecated. 
Use `schemas` instead.\",\n                    DeprecationWarning,\n                    stacklevel=4,\n                )\n            elif queryset.model is Feature and parts[0] == \"dtype\":\n                warnings.warn(\n                    \"Querying Feature by `dtype` is deprecated. Use `dtype_as_str` instead. \"\n                    \"Notice the new dtype encoding format for Record and ULabel subtypes.\",\n                    DeprecationWarning,\n                    stacklevel=4,\n                )\n            new_field = name_mappings[parts[0]] + (\n                \"__\" + \"__\".join(parts[1:]) if len(parts) > 1 else \"\"\n            )\n            mapped[new_field] = (\n                _map_status_value(value) if parts[0] == \"status\" else value\n            )\n        else:\n            mapped[field] = value\n    return list(mapped.keys()) if was_list else mapped\n\n\ndef process_expressions(queryset: QuerySet, queries: tuple, expressions: dict) -> dict:\n    def _map_databases(value: Any, key: str, target_db: str) -> tuple[str, Any]:\n        if isinstance(value, SQLRecord):\n            if value._state.db != target_db:\n                logger.warning(\n                    f\"passing record from database {value._state.db} to query {target_db}, matching on uid '{value.uid}'\"\n                )\n                return f\"{key}__uid\", value.uid\n            return key, value\n\n        if (\n            key.endswith(\"__in\")\n            and isinstance(value, IterableType)\n            and not isinstance(value, str)\n        ):\n            if any(\n                isinstance(v, SQLRecord) and v._state.db != target_db for v in value\n            ):\n                logger.warning(\n                    f\"passing records from another database to query {target_db}, matching on uids\"\n                )\n                return key.replace(\"__in\", \"__uid__in\"), [\n                    v.uid if isinstance(v, SQLRecord) else v for v in value\n          
      ]\n            return key, value\n\n        return key, value\n\n    branch_fields = {\"branch\", \"branch_id\"}\n    branch_prefixes = (\"branch__\", \"branch_id__\")\n\n    def queries_contain_branch(queries: tuple) -> bool:\n        \"\"\"Check if any Q object in queries references branch or branch_id.\"\"\"\n\n        def check_q_object(q: Q) -> bool:\n            # Q objects store their conditions in q.children\n            for child in q.children:\n                if isinstance(child, tuple) and len(child) == 2:\n                    # Normal condition: (key, value)\n                    key = child[0]\n                    if key in branch_fields or key.startswith(branch_prefixes):\n                        return True\n                elif isinstance(child, Q):\n                    # Nested Q object\n                    if check_q_object(child):\n                        return True\n            return False\n\n        return any(check_q_object(q) for q in queries if isinstance(q, Q))\n\n    expressions = get_backward_compat_filter_kwargs(\n        queryset,\n        expressions,\n    )\n    model_has_branch = any(\n        field.name == \"branch\" for field in queryset.model._meta.concrete_fields\n    )\n    if issubclass(queryset.model, SQLRecord) or model_has_branch:\n        # branch_id is set to 1 unless expressions contains id, uid or hash\n        id_uid_hash = {\"id\", \"uid\", \"hash\", \"id__in\", \"uid__in\", \"hash__in\"}\n        if not any(expression in id_uid_hash for expression in expressions):\n            expressions_have_branch = False\n            for expression in expressions:\n                if expression in branch_fields or expression.startswith(\n                    branch_prefixes\n                ):\n                    expressions_have_branch = True\n                    break\n            if not expressions_have_branch and not queries_contain_branch(queries):\n                expressions[\"branch_id__in\"] = 
get_default_branch_ids()\n            else:\n                # if branch_id is None, do not apply a filter\n                # otherwise, it would mean filtering for NULL values, which doesn't make\n                # sense for a non-NULLABLE column\n                if \"branch_id\" in expressions and expressions[\"branch_id\"] is None:\n                    expressions.pop(\"branch_id\")\n                if \"branch\" in expressions and expressions[\"branch\"] is None:\n                    expressions.pop(\"branch\")\n\n    if queryset._db is not None:\n        # only check for database mismatch if there is a defined database on the\n        # queryset\n        return dict(\n            (\n                _map_databases(value, key, queryset._db)\n                for key, value in expressions.items()\n            )\n        )\n    else:\n        return expressions\n\n\ndef get(\n    registry_or_queryset: Registry | BasicQuerySet,\n    idlike: int | str | None = None,\n    **expressions,\n) -> SQLRecord:\n    if isinstance(registry_or_queryset, BasicQuerySet):\n        # not QuerySet but only BasicQuerySet\n        assert not isinstance(registry_or_queryset, QuerySet)  # noqa: S101\n\n        qs = registry_or_queryset\n        registry = qs.model\n    else:\n        qs = BasicQuerySet(model=registry_or_queryset)\n        registry = registry_or_queryset\n\n    if isinstance(idlike, int):\n        return qs.get(id=idlike)\n    elif isinstance(idlike, str):\n        NAME_FIELD = (\n            registry._name_field if hasattr(registry, \"_name_field\") else \"name\"\n        )\n        DOESNOTEXIST_MSG = f\"No record found with uid '{idlike}'. 
Did you forget a keyword as in {registry.__name__}.get({NAME_FIELD}='{idlike}')?\"\n        # this is the case in which the user passes an under-specified uid\n        if issubclass(registry, IsVersioned) and len(idlike) <= registry._len_stem_uid:\n            new_qs = qs.filter(uid__startswith=idlike, is_latest=True)\n            not_exists = None\n            if not new_qs.exists():\n                # also try is_latest is False due to nothing found\n                new_qs = qs.filter(uid__startswith=idlike, is_latest=False)\n            else:\n                not_exists = False\n            # it doesn't make sense to raise MultipleResultsFound when querying with an\n            # underspecified uid\n            return one_helper(\n                new_qs,\n                DOESNOTEXIST_MSG,\n                not_exists=not_exists,\n                raise_multipleresultsfound=False,\n            )\n        else:\n            qs = qs.filter(uid__startswith=idlike)\n            return one_helper(qs, DOESNOTEXIST_MSG)\n    else:\n        assert idlike is None  # noqa: S101\n        expressions = process_expressions(qs, [], expressions)\n        # inject is_latest for consistency with idlike\n        is_latest_was_not_in_expressions = \"is_latest\" not in expressions\n        if issubclass(registry, IsVersioned) and is_latest_was_not_in_expressions:\n            expressions[\"is_latest\"] = True\n        try:\n            return qs.get(**expressions)\n        except registry.DoesNotExist as e:\n            # handle the case in which the is_latest injection led to a missed query\n            if \"is_latest\" in expressions and is_latest_was_not_in_expressions:\n                expressions.pop(\"is_latest\")\n                result = qs.filter(**expressions).order_by(\"-created_at\").first()\n                if result is not None:\n                    return result\n            raise e\n\n\nclass SQLRecordList(UserList, Generic[T]):\n    \"\"\"Is ordered, can't be queried, 
but has `.to_dataframe()`.\"\"\"\n\n    def __init__(self, records: Iterable[T]):\n        if isinstance(records, list):\n            self.data = records  # Direct assignment if already a list, no copy\n        else:\n            super().__init__(records)  # Let UserList handle the conversion\n\n    def to_dataframe(self) -> pd.DataFrame:\n        import pandas as pd\n\n        keys = get_keys_from_df(self.data, self.data[0].__class__)\n        values = [record.__dict__ for record in self.data]\n        return pd.DataFrame(values, columns=keys)\n\n    @deprecated(new_name=\"to_dataframe\")\n    def df(self) -> pd.DataFrame:\n        return self.to_dataframe()\n\n    def to_list(\n        self, field: str | None = None\n    ) -> list[str]:  # meaningful to be parallel with to_list() in QuerySet\n        if field is None:\n            return self.data\n        return [getattr(record, field) for record in self.data]\n\n    def one(self) -> T:\n        \"\"\"Exactly one result. Throws error if there are more or none.\"\"\"\n        return one_helper(self)\n\n    def save(self) -> SQLRecordList[T]:\n        \"\"\"Save all records to the database.\"\"\"\n        from lamindb.models.save import save\n\n        save(self)\n        return self\n\n\ndef get_basic_field_names(\n    qs: QuerySet,\n    include: list[str],\n    features_input: bool | list[str] | str,\n) -> list[str]:\n    exclude_field_names = [\"updated_at\"]\n    include_private_fields = False\n    if \"privates\" in include:\n        include_private_fields = True\n        include.remove(\"privates\")\n    field_names = [\n        field.name\n        for field in qs.model._meta.fields\n        if (\n            not isinstance(field, models.ForeignKey)\n            and field.name not in exclude_field_names\n            and (\n                not field.name.startswith(\"_\")\n                or include_private_fields\n                or (field.name == \"_dtype_str\" and qs.model.__name__ == \"Feature\")\n        
    )\n        )\n    ]\n    # TODO: harmonize with L1023 in sqlrecord.py\n    for field_name in [\n        \"version_tag\",\n        \"is_latest\",\n        \"is_locked\",\n        \"is_type\",\n        \"created_at\",\n        \"updated_at\",\n        \"created_on\",\n    ]:\n        if field_name in field_names:\n            field_names.append(field_names.pop(field_names.index(field_name)))\n    field_names += [\n        f\"{field.name}_id\"\n        for field in qs.model._meta.fields\n        if isinstance(field, models.ForeignKey)\n    ]\n    # move uid to first position if present\n    if \"uid\" in field_names:\n        field_names.insert(0, field_names.pop(field_names.index(\"uid\")))\n\n    # move primary key to second position if present\n    pk = qs.model._meta.pk.name if qs.model._meta.pk else None\n    if pk and pk in field_names:\n        field_names.insert(1, field_names.pop(field_names.index(pk)))\n    if (\n        include or features_input\n    ):  # if there is features_input, reduce fields to just the first 3\n        subset_field_names = field_names[:3]\n        intersection = set(field_names) & set(include)\n        subset_field_names += list(intersection)\n        field_names = subset_field_names\n    return field_names\n\n\ndef get_feature_annotate_kwargs(\n    registry: Registry,\n    features: bool | list[str] | str | None,\n    qs: QuerySet | None = None,\n) -> tuple[dict[str, Any], QuerySet, dict[str, Any]]:\n    from lamindb.models import (\n        Artifact,\n        Feature,\n        Record,\n        RecordJson,\n        Run,\n        ULabel,\n    )\n    from lamindb.models.feature import parse_dtype\n\n    if registry not in {Artifact, Record, Run}:\n        raise ValueError(\n            f'include=\"features\" is only applicable for Artifact, Record, and Run, not {registry.__name__}'\n        )\n\n    feature_ids = []\n    if features == \"queryset\":\n        ids_list = qs.values_list(\"id\", flat=True)\n        for obj in 
registry._meta.related_objects:\n            related_name_attr = getattr(registry, obj.related_name, None)\n            if related_name_attr is None or not hasattr(related_name_attr, \"through\"):\n                continue\n            link_model = related_name_attr.through\n            if (\n                not hasattr(link_model, \"feature\")\n                or link_model.__name__ == \"Record_parents\"\n            ):\n                continue\n            filter_field = registry.__name__.lower()\n            if not hasattr(link_model, filter_field):\n                potential_fields = []\n                for field in link_model._meta.get_fields():\n                    if field.is_relation and field.related_model is registry:\n                        potential_fields.append(field.name)\n                if len(potential_fields) == 1:\n                    filter_field = potential_fields[0]\n                else:\n                    continue\n            links = link_model.objects.using(qs.db).filter(\n                **{filter_field + \"_id__in\": ids_list}\n            )\n            feature_ids_for_link_model = links.values_list(\"feature__id\", flat=True)\n            feature_ids += feature_ids_for_link_model\n        if registry is Record:\n            # this request is not strictly necessary, but it makes the resulting reshaped\n            # dataframe consistent\n            feature_ids += RecordJson.filter(record_id__in=ids_list).values_list(\n                \"feature__id\", flat=True\n            )\n        feature_ids = list(set(feature_ids))  # remove duplicates\n\n    feature_qs = Feature.connect(None if qs is None else qs.db).filter(\n        _dtype_str__isnull=False\n    )\n    if isinstance(features, list):\n        feature_qs = feature_qs.filter(name__in=features)\n        if len(features) != feature_qs.count():\n            logger.warning(\n                f\"found features and passed features differ:\\n - passed: {features}\\n - found: 
{feature_qs.to_list('name')}\"\n            )\n    elif feature_ids:\n        feature_qs = feature_qs.filter(id__in=feature_ids)\n    else:\n        feature_qs = feature_qs.filter(\n            ~Q(_dtype_str__startswith=\"cat[\")\n            | Q(_dtype_str__startswith=\"cat[ULabel\")\n            | Q(_dtype_str__startswith=\"cat[Record\")\n        )\n        logger.important(\n            f\"queried for all categorical features of dtypes Record or ULabel and non-categorical features: ({len(feature_qs)}) {feature_qs.to_list('name')}\"\n        )\n    # Duplicate feature names map to ambiguous dataframe columns. We keep a single\n    # feature per name for query annotation and warn loudly to surface this.\n    feature_name_to_ids: dict[str, list[int]] = defaultdict(list)\n    for feature in feature_qs.order_by(\"id\"):\n        feature_name_to_ids[feature.name].append(feature.id)\n    duplicate_feature_names = {\n        name: ids for name, ids in feature_name_to_ids.items() if len(ids) > 1\n    }\n    if duplicate_feature_names:\n        logger.warning(\n            \"detected duplicate feature names while building dataframe features; \"\n            \"keeping the first feature per name by ascending id. 
\"\n            f\"duplicates: {duplicate_feature_names}\"\n        )\n        unique_feature_ids = [ids[0] for ids in feature_name_to_ids.values()]\n        feature_qs = feature_qs.filter(id__in=unique_feature_ids)\n    # Get the categorical features\n    cat_feature_types = {\n        parse_dtype(feature._dtype_str)[0][\"registry_str\"]\n        for feature in feature_qs\n        if feature._dtype_str.startswith(\"cat[\")\n        or feature._dtype_str.startswith(\"list[cat[\")\n    }\n    # fields to annotate\n    cat_feature_fields = defaultdict(list)\n    for feature in feature_qs:\n        dtype_str = feature._dtype_str\n        if dtype_str.startswith(\"cat[\") or dtype_str.startswith(\"list[cat[\"):\n            dtype_info = parse_dtype(dtype_str)[0]\n            registry_str = dtype_info[\"registry_str\"]\n            field_name = dtype_info[\"field_str\"]\n            cat_feature_fields[registry_str].append(field_name)\n    # Get relationships of labels and features\n    link_models_on_models = {\n        getattr(\n            registry, obj.related_name\n        ).through.__get_name_with_module__(): obj.related_model\n        for obj in registry._meta.related_objects\n        if obj.related_model.__get_name_with_module__() in cat_feature_types\n        and hasattr(getattr(registry, obj.related_name), \"through\")\n        and hasattr(getattr(registry, obj.related_name).through, \"feature_id\")\n    }\n    if registry is Artifact:\n        link_models_on_models[\"ArtifactULabel\"] = ULabel\n    elif registry is Record:\n        link_models_on_models[\"RecordRecord\"] = Record\n    link_attributes_on_models = {\n        obj.related_name: link_models_on_models[\n            obj.related_model.__get_name_with_module__()\n        ]\n        for obj in registry._meta.related_objects\n        if (\n            obj.related_model.__get_name_with_module__() in link_models_on_models\n            and (\n                not 
obj.related_name.startswith(\"links_record\")\n                if registry is Record\n                else True\n            )\n        )\n    }\n    # Prepare Django's annotate for features with filtering\n    filtered_relations = {}\n    annotate_kwargs = {}\n\n    for link_attr, feature_type_model in link_attributes_on_models.items():\n        feature_type = feature_type_model.__get_name_with_module__()\n        if link_attr == \"links_project\" and registry is Record:\n            # we're only interested in _values_project when \"annotating\" records\n            continue\n\n        # Determine field name\n        if registry in {Artifact, Run}:\n            field_name = (\n                feature_type.split(\".\")[1] if \".\" in feature_type else feature_type\n            ).lower()\n        else:\n            field_name = \"value\"\n\n        # Determine if this value model needs branch filtering\n        # Skip user relations (RecordUser, ArtifactUser don't have branch)\n        should_filter_branch = link_attr not in {\"values_user\", \"links_user\"}\n\n        # Create filtered relation for the value model\n        value_relation_path = f\"{link_attr}__{field_name}\"\n        filtered_value_relation_name = f\"filtered_{link_attr}_{field_name}\"\n\n        if should_filter_branch:\n            filtered_relations[filtered_value_relation_name] = FilteredRelation(\n                value_relation_path,\n                condition=Q(\n                    **{\n                        f\"{value_relation_path}__branch_id__in\": get_default_branch_ids()\n                    }\n                ),\n            )\n        else:\n            # No branch filtering needed\n            filtered_relations[filtered_value_relation_name] = FilteredRelation(\n                value_relation_path\n            )\n\n        # Add annotation for feature name (feature doesn't have branch_id)\n        annotate_kwargs[f\"{link_attr}__feature__name\"] = F(\n            
f\"{link_attr}__feature__name\"\n        )\n\n        # Add annotations for categorical feature fields using the filtered relation\n        for field in cat_feature_fields[feature_type]:\n            annotate_kwargs[f\"{link_attr}__{field_name}__{field}\"] = F(\n                f\"{filtered_value_relation_name}__{field}\"\n            )\n\n    # Handle JSON values (no branch filtering needed)\n    json_values_attribute = (\n        \"json_values\" if registry in {Artifact, Run} else \"values_json\"\n    )\n    annotate_kwargs[f\"{json_values_attribute}__feature__name\"] = F(\n        f\"{json_values_attribute}__feature__name\"\n    )\n    annotate_kwargs[f\"{json_values_attribute}__value\"] = F(\n        f\"{json_values_attribute}__value\"\n    )\n\n    return annotate_kwargs, feature_qs, filtered_relations\n\n\n# https://claude.ai/share/16280046-6ae5-4f6a-99ac-dec01813dc3c\ndef analyze_lookup_cardinality(\n    model_class: SQLRecord, lookup_paths: list[str] | None\n) -> dict[str, str]:\n    \"\"\"Analyze lookup cardinality.\n\n    Analyzes Django model lookups to determine if they will result in\n    one-to-one or one-to-many relationships when used in annotations.\n\n    Args:\n        model_class: The Django model class to analyze\n        include: List of lookup paths (e.g. 
[\"created_by__name\", \"ulabels__name\"])\n\n    Returns:\n        Dictionary mapping lookup paths to either 'one' or 'many'\n    \"\"\"\n    result = {}  # type: ignore\n    if lookup_paths is None:\n        return result\n    for lookup_path in lookup_paths:\n        parts = lookup_path.split(\"__\")\n        current_model = model_class\n        is_many = False\n\n        # Walk through each part of the lookup path\n        for part in parts[:-1]:  # Exclude the last part as it's an attribute\n            field = None\n\n            # Handle reverse relations\n            for f in current_model._meta.get_fields():\n                if isinstance(f, ForeignObjectRel) and f.get_accessor_name() == part:\n                    field = f\n                    is_many = not f.one_to_one\n                    if hasattr(f, \"field\"):\n                        current_model = f.field.model\n                    break\n\n            # Handle forward relations\n            if field is None:\n                field = current_model._meta.get_field(part)\n                if isinstance(field, ManyToManyField):\n                    is_many = True\n                    current_model = field.remote_field.model\n                elif isinstance(field, ForeignKey):\n                    current_model = field.remote_field.model\n\n        result[lookup_path] = \"many\" if is_many else \"one\"\n\n    return result\n\n\ndef reorder_subset_columns_in_df(\n    df: pd.DataFrame, column_order: list[str], position=3\n) -> pd.DataFrame:\n    \"\"\"Reorder subset of columns in dataframe to specified position.\"\"\"\n    valid_columns = [col for col in column_order if col in df.columns]\n    all_cols = df.columns.tolist()\n    remaining_cols = [col for col in all_cols if col not in valid_columns]\n    new_order = remaining_cols[:position] + valid_columns + remaining_cols[position:]\n    return df[new_order]\n\n\ndef encode_lamindb_fields_as_columns(\n    registry: Registry, fields: str | list[str]\n) 
-> str | dict[str, str]:\n    \"\"\"Encode laminDB specific fields in dataframe with __lamindb_{model_name}_{field_name}__.\n\n    This is needed when reshaping dataframes with features to avoid conflicts between\n    laminDB fields and feature names.\n    \"\"\"\n\n    def encode(field: str) -> str:\n        return f\"__lamindb_{registry._meta.model_name}_{field}__\"\n\n    registry_field_names = {field.name for field in registry._meta.concrete_fields}\n\n    if isinstance(fields, str):\n        return encode(fields) if fields in registry_field_names else fields\n\n    return {field: encode(field) for field in fields if field in registry_field_names}\n\n\n# https://lamin.ai/laminlabs/lamindata/transform/BblTiuKxsb2g0003\n# https://claude.ai/chat/6ea2498c-944d-4e7a-af08-29e5ddf637d2\ndef reshape_annotate_result(\n    registry: Registry,\n    df: pd.DataFrame,\n    field_names: list[str],\n    cols_from_include: dict[str, str] | None,\n    feature_qs: QuerySet | None,\n) -> pd.DataFrame:\n    \"\"\"Reshapes tidy table to wide format.\n\n    Args:\n        registry: The registry model (e.g., Artifact)\n        df: Input dataframe with experimental data\n        field_names: List of basic fields to include in result\n        cols_from_include: Dict specifying additional columns to process with types\n            ('one' or 'many'), e.g., {'ulabels__name': 'many', 'created_by__name': 'one'}\n        feature_qs: QuerySet of features\n    \"\"\"\n    import pandas as pd\n\n    from lamindb.models import Artifact, Run\n\n    cols_from_include = cols_from_include or {}\n\n    # Initialize result with basic fields (need a copy since we're modifying it)\n    result = df[field_names].copy()\n    pk_name = registry._meta.pk.name\n\n    # ========== no features requested ==========\n    if feature_qs is None or not feature_qs.exists():\n        if cols_from_include:\n            result = process_cols_from_include(df, result, cols_from_include, pk_name)\n        return 
result.drop_duplicates(subset=[pk_name])\n\n    # ========== process features ==========\n\n    # Encode Django field names to avoid conflicts with feature names\n    fields_map = encode_lamindb_fields_as_columns(registry, df.columns)\n    df_encoded = df.rename(columns=fields_map)\n    result_encoded = result.rename(columns=fields_map)\n    pk_name_encoded = fields_map.get(pk_name)  # type: ignore\n\n    # --- Process JSON-stored feature values ---\n    json_values_attribute = (\n        \"json_values\" if registry in {Artifact, Run} else \"values_json\"\n    )\n    feature_name_col = f\"{json_values_attribute}__feature__name\"\n    feature_value_col = f\"{json_values_attribute}__value\"\n\n    if all(col in df_encoded.columns for col in [feature_name_col, feature_value_col]):\n        # Separate dict and non-dict values for different aggregation strategies\n        is_dict_or_list = df_encoded[feature_value_col].apply(\n            lambda x: isinstance(x, (dict, list))\n        )\n        dict_or_list_df = df_encoded[is_dict_or_list]\n        non_dict_or_list_df = df_encoded[~is_dict_or_list]\n\n        # Aggregate: sets for non-dict values, first for dict values\n        groupby_cols = [pk_name_encoded, feature_name_col]\n        non_dict_or_list_features = non_dict_or_list_df.groupby(groupby_cols)[\n            feature_value_col\n        ].agg(set)\n        dict_or_list_features = dict_or_list_df.groupby(groupby_cols)[\n            feature_value_col\n        ].agg(\"first\")\n\n        # Combine and pivot to wide format\n        combined_features = pd.concat(\n            [non_dict_or_list_features, dict_or_list_features]\n        )\n        feature_values = combined_features.unstack().reset_index()\n\n        if not feature_values.empty:\n            result_encoded = result_encoded.join(\n                feature_values.set_index(pk_name_encoded),\n                on=pk_name_encoded,\n            )\n\n    # --- Process categorical/linked features ---\n    
links_prefix = \"links_\" if registry in {Artifact, Run} else (\"links_\", \"values_\")\n    links_features = [\n        col\n        for col in df.columns\n        if \"feature__name\" in col and col.startswith(links_prefix)\n    ]\n\n    if links_features:\n        result_encoded = process_links_features(\n            df_encoded,\n            result_encoded,\n            links_features,\n            feature_qs,\n            pk_name_encoded,\n        )\n\n    # --- Apply type conversions based on feature metadata ---\n    def extract_and_check_scalar(series: pd.Series) -> tuple[pd.Series, bool]:\n        \"\"\"Extract single elements and return if column is now scalar.\"\"\"\n        has_multiple_values = False\n\n        def extract_and_track(value):\n            nonlocal has_multiple_values\n            if not hasattr(value, \"__len__\") or isinstance(value, str):\n                return value\n            if len(value) != 1:\n                has_multiple_values = True\n                return value\n            return next(iter(value))\n\n        extracted = series.apply(extract_and_track)\n        is_scalar = not has_multiple_values\n        return extracted, is_scalar\n\n    for feature in feature_qs:\n        if feature.name not in result_encoded.columns:\n            continue\n\n        result_encoded[feature.name], is_scalar = extract_and_check_scalar(\n            result_encoded[feature.name]\n        )\n\n        if is_scalar:\n            dtype_str = feature._dtype_str\n            if dtype_str.startswith(\"cat\"):\n                result_encoded[feature.name] = result_encoded[feature.name].astype(\n                    \"category\"\n                )\n            if dtype_str == \"datetime\":\n                # format and utc args are needed for mixed data\n                # pandera expects timezone-naive datetime objects, and hence,\n                # we need to localize with None\n                result_encoded[feature.name] = pd.to_datetime(\n         
           result_encoded[feature.name], format=\"ISO8601\", utc=True\n                ).dt.tz_localize(None)\n            if dtype_str == \"date\":\n                # see comments for datetime\n                result_encoded[feature.name] = (\n                    pd.to_datetime(\n                        result_encoded[feature.name],\n                        format=\"ISO8601\",\n                        utc=True,\n                    )\n                    .dt.tz_localize(None)\n                    .dt.date\n                )\n            if dtype_str == \"bool\":\n                result_encoded[feature.name] = result_encoded[feature.name].astype(\n                    \"boolean\"\n                )\n\n        dtype_str = feature._dtype_str\n        if dtype_str.startswith(\"list\"):\n            mask = result_encoded[feature.name].notna()\n            result_encoded.loc[mask, feature.name] = result_encoded.loc[\n                mask, feature.name\n            ].apply(lambda x: list(x) if isinstance(x, (set, list)) else [x])\n\n        if dtype_str == \"dict\":\n            # this is the case when a dict is stored as a string; won't happen\n            # within lamindb but might for external data\n            if isinstance(result_encoded[feature.name].iloc[0], str):\n                result_encoded[feature.name] = result_encoded[feature.name].apply(\n                    lambda x: ast.literal_eval(x) if isinstance(x, str) else x\n                )\n\n    # --- Finalize result ---\n\n    # Reorder columns to prioritize features\n    result_encoded = reorder_subset_columns_in_df(\n        result_encoded,\n        feature_qs.to_list(\"name\"),  # type: ignore\n    )\n\n    # Process additional included columns\n    if cols_from_include:\n        cols_from_include_encoded = {\n            fields_map.get(k, k): v  # type: ignore\n            for k, v in cols_from_include.items()\n        }\n        result_encoded = process_cols_from_include(\n            df_encoded, 
result_encoded, cols_from_include_encoded, pk_name_encoded\n        )\n\n    # Decode field names back to original, except where conflicts exist\n    # (e.g., if a feature is also named 'id', keep the encoded field name)\n    decode_map = {\n        encoded: original\n        for original, encoded in fields_map.items()  # type: ignore\n        if original not in result_encoded.columns\n    }\n\n    return result_encoded.drop_duplicates(subset=[pk_name_encoded]).rename(\n        columns=decode_map\n    )\n\n\ndef process_links_features(\n    df: pd.DataFrame,\n    result: pd.DataFrame,\n    feature_cols: list[str],\n    feature_qs: QuerySet | None,\n    pk_name: str = \"id\",\n) -> pd.DataFrame:\n    \"\"\"Process links_XXX feature columns.\"\"\"\n    import pandas as pd\n\n    from lamindb.models.feature import parse_dtype\n\n    # this loops over different entities that might be linked under a feature\n    for feature_col in feature_cols:\n        links_attribute = \"links_\" if feature_col.startswith(\"links_\") else \"values_\"\n        regex = f\"{links_attribute}(.+?)__feature__name\"\n        prefix = re.match(regex, feature_col).group(1)\n\n        value_cols = [\n            col\n            for col in df.columns\n            if col.startswith(f\"{links_attribute}{prefix}__\")\n            and \"feature__name\" not in col\n        ]\n\n        if not value_cols:\n            continue\n\n        value_col = value_cols[0]\n        feature_names = df[feature_col].unique()\n        feature_names = feature_names[~pd.isna(feature_names)]\n\n        for feature in feature_qs:\n            if feature.name not in feature_names:\n                continue\n            if feature.name in result.columns:\n                continue\n            field_name = parse_dtype(feature._dtype_str)[0][\"field_str\"]\n            value_col = [c for c in value_cols if c.endswith(f\"__{field_name}\")][0]\n            mask = (df[feature_col] == feature.name) & df[value_col].notna()\n   
         feature_values = df[mask].groupby(pk_name)[value_col].agg(set)\n            result.insert(3, feature.name, result[pk_name].map(feature_values))\n\n    return result\n\n\ndef process_cols_from_include(\n    df: pd.DataFrame,\n    result: pd.DataFrame,\n    extra_columns: dict[str, str],\n    pk_name: str = \"id\",\n) -> pd.DataFrame:\n    \"\"\"Process additional columns based on their specified types.\"\"\"\n    for col, col_type in extra_columns.items():\n        if col not in df.columns:\n            continue\n        if col in result.columns:\n            continue\n\n        values = df.groupby(pk_name)[col].agg(set if col_type == \"many\" else \"first\")\n        result.insert(3, col, result[pk_name].map(values))\n\n    return result\n\n\ndef _queryset_class_factory(\n    registry: Registry, queryset_cls: type[models.QuerySet]\n) -> type[models.QuerySet]:\n    from lamindb.models import Artifact, ArtifactSet\n\n    # If the model is Artifact, create a new class for BasicQuerySet or QuerySet that inherits from ArtifactSet.\n    # This allows to add artifact specific functionality to all classes inheriting from BasicQuerySet.\n    # Thus all query sets of artifacts (and only of artifacts) will have functions from ArtifactSet.\n    if registry is Artifact and not issubclass(queryset_cls, ArtifactSet):\n        new_cls = type(\n            \"Artifact\" + queryset_cls.__name__, (queryset_cls, ArtifactSet), {}\n        )\n    else:\n        new_cls = queryset_cls\n    return new_cls\n\n\nclass BasicQuerySet(models.QuerySet):\n    \"\"\"Sets of records returned by queries.\n\n    See Also:\n\n        `django QuerySet <https://docs.djangoproject.com/en/stable/ref/models/querysets/>`__\n\n    Examples:\n\n        Any filter statement produces a query set::\n\n            queryset = Registry.filter(name__startswith=\"keyword\")\n    \"\"\"\n\n    def __new__(cls, model=None, query=None, using=None, hints=None):\n        # see comments in 
_queryset_class_factory\n        return object.__new__(_queryset_class_factory(model, cls))\n\n    def _to_class(\n        self, cls: type[models.QuerySet], copy: bool = True\n    ) -> models.QuerySet:\n        qs = self.all() if copy else self\n        qs.__class__ = cls\n        return qs\n\n    def _to_basic(self, copy: bool = True) -> BasicQuerySet:\n        cls = _queryset_class_factory(self.model, BasicQuerySet)\n        return self._to_class(cls, copy)\n\n    def _to_non_basic(self, copy: bool = True) -> QuerySet:\n        cls = _queryset_class_factory(self.model, QuerySet)\n        return self._to_class(cls, copy)\n\n    @doc_args(SQLRecord.to_dataframe.__doc__)\n    def to_dataframe(\n        self,\n        *,\n        include: str | list[str] | None = None,\n        features: str | list[str] | None = None,\n        limit: int | None = 100,\n        order_by: str | None = \"-id\",\n    ) -> pd.DataFrame:\n        \"\"\"{}\"\"\"  # noqa: D415\n        import pandas as pd\n\n        if (\n            self.model.__name__ == \"Artifact\"\n            and \"kind\" not in str(self.query.where)\n            and self.query.low_mark\n            == 0  # this should be 0, not None, it represent OFFSET = 0\n            and self.query.high_mark\n            is None  # this should be None, it represent _no_ LIMIT\n        ):\n            subset = self.exclude(**{\"kind__startswith\": \"__lamindb\"})\n        else:\n            subset = self\n        # check if queryset is already ordered\n        is_ordered = bool(subset.query.order_by)\n        # Only apply order_by if not already ordered and order_by is specified\n        if not is_ordered and order_by is not None:\n            subset = subset.order_by(order_by)\n        is_truncated = False\n        if limit is not None:\n            # Fetch one extra row as a sentinel to detect truncation without count().\n            subset = subset[: limit + 1]\n        if include is None:\n            include_input = []\n        
elif isinstance(include, str):\n            include_input = [include]\n        else:\n            include_input = include\n        if \"features\" in include_input:\n            include_input.remove(\"features\")\n            if features is None:\n                # indicate the default features with True\n                # should refactor this in the future\n                features = True  # type: ignore\n        features_input = [] if features is None else features\n        include = get_backward_compat_filter_kwargs(subset, include_input)\n        field_names = get_basic_field_names(subset, include_input, features_input)\n\n        annotate_kwargs = {}\n        filtered_relations = {}  # type: ignore\n        feature_qs = None\n        if features:\n            feature_annotate_kwargs, feature_qs, filtered_relations = (\n                get_feature_annotate_kwargs(subset.model, features, subset)\n            )\n            annotate_kwargs.update(feature_annotate_kwargs)\n        if include_input:\n            include_input = include_input.copy()[::-1]  # type: ignore\n            include_kwargs = {s: F(s) for s in include_input if s not in field_names}\n            annotate_kwargs.update(include_kwargs)\n        if annotate_kwargs:\n            id_subquery = subset.values(\"id\")\n            # for annotate, we want the queryset without filters so that joins don't affect the annotations\n            query_set_without_filters = subset.model.objects.using(subset.db).filter(\n                id__in=Subquery(id_subquery)\n            )\n            if subset.query.order_by:\n                # Apply the same ordering to the new queryset\n                query_set_without_filters = query_set_without_filters.order_by(\n                    *subset.query.order_by\n                )\n            if filtered_relations:\n                query_set_without_filters = query_set_without_filters.annotate(\n                    **filtered_relations\n                )\n            
queryset = query_set_without_filters.annotate(**annotate_kwargs)\n        else:\n            queryset = subset\n\n        # our main problem with this approach is that we lose ordering in categorical lists\n        # we'd need to respect ordering through the primary key on the link table, but that's\n        # another refactoring effort\n        # we have the correct ordering in `features.get_values()`, though\n        df = pd.DataFrame(queryset.values(*field_names, *list(annotate_kwargs.keys())))\n        if limit is not None and len(df) > limit:\n            is_truncated = True\n            df = df.iloc[:limit].copy()\n        if len(df) == 0:\n            df = pd.DataFrame({}, columns=field_names)\n            return df\n        cols_from_include = analyze_lookup_cardinality(self.model, include_input)  # type: ignore\n        df_reshaped = reshape_annotate_result(\n            self.model, df, field_names, cols_from_include, feature_qs\n        )\n        pk_name = self.model._meta.pk.name\n        encoded_pk_name = encode_lamindb_fields_as_columns(self.model, pk_name)\n        if encoded_pk_name in df_reshaped.columns:\n            df_reshaped = df_reshaped.set_index(encoded_pk_name)\n        else:\n            pk_column_name = pk_name if pk_name in df.columns else f\"{pk_name}_id\"\n            if pk_column_name in df_reshaped.columns:\n                df_reshaped = df_reshaped.set_index(pk_column_name)\n\n        # cast floats and ints where appropriate\n        # this is currently needed because the UI writes into the JSON field through JS\n        # and thus a `10` might be a float, not an int\n        # note: also type casting within reshape_annotate_result\n        if feature_qs is not None:\n            for feature in feature_qs:\n                if feature.name in df_reshaped.columns:\n                    current_dtype = df_reshaped[feature.name].dtype\n                    dtype_str = feature._dtype_str\n                    if dtype_str == \"int\" and 
not pd.api.types.is_integer_dtype(\n                        current_dtype\n                    ):\n                        df_reshaped[feature.name] = df_reshaped[feature.name].astype(\n                            \"Int64\"  # nullable integer dtype\n                        )\n                    elif dtype_str == \"float\" and not pd.api.types.is_float_dtype(\n                        current_dtype\n                    ):\n                        df_reshaped[feature.name] = df_reshaped[feature.name].astype(\n                            float\n                        )\n        if is_truncated:\n            logger.warning(\n                f\"truncated query result to limit={limit} {self.model.__name__} objects\"\n            )\n        return df_reshaped\n\n    @deprecated(new_name=\"to_dataframe\")\n    def df(\n        self,\n        include: str | list[str] | None = None,\n        features: bool | list[str] | str | None = None,\n    ) -> pd.DataFrame:\n        return self.to_dataframe(include=include, features=features)\n\n    def describe(self, return_str: bool = False) -> str | None:\n        \"\"\"Describe the query set to learn about available fields.\"\"\"\n        return self.model.describe(return_str=return_str)\n\n    def delete(self, *args, permanent: bool | None = None, **kwargs):\n        \"\"\"Delete all records in the query set.\n\n        Args:\n            permanent: Whether to permanently delete the record (skips trash).\n                Is only relevant for records that have the `branch` field.\n                If `None`, uses soft delete for records that have the `branch` field, hard delete otherwise.\n\n        Note:\n            Calling `delete()` twice on the same queryset does NOT permanently delete in bulk operations.\n            Use `permanent=True` for actual deletion.\n\n        Examples:\n\n            For a `QuerySet` object `qs`, call::\n\n                qs.delete()\n        \"\"\"\n        from lamindb.models import Artifact, 
Collection, Run, Storage, Transform\n\n        if self.model is Run:\n            if permanent is True:\n                from .run import _permanent_delete_runs\n\n                _permanent_delete_runs(self)\n                return\n            if permanent is not True:\n                self.update(branch_id=-1)\n                return\n        if self.model is Transform:\n            if permanent is True:\n                from .transform import _permanent_delete_transforms\n\n                _permanent_delete_transforms(self)\n                return\n            if permanent is not True:\n                _adjust_is_latest_when_deleting_is_versioned(self)\n                self.update(branch_id=-1, is_latest=False)\n                return\n        # Artifact, Collection: non-trivial delete behavior, handle in a loop\n        if self.model in {Artifact, Collection}:\n            for record in self:\n                record.delete(*args, permanent=permanent, **kwargs)\n        elif self.model is Storage:  # storage does not have soft delete\n            if permanent is False:\n                raise ValueError(\n                    \"Soft delete is not possible for Storage, \"\n                    \"use 'permanent=True' or 'permanent=None' for permanent deletion.\"\n                )\n            for record in self:\n                record.delete()\n        else:\n            if not permanent and hasattr(self.model, \"branch_id\"):\n                logger.warning(\"moved records to trash (branch_id = -1)\")\n                self.update(branch_id=-1)\n            else:\n                if permanent is False:\n                    raise ValueError(\n                        f\"Soft delete is not possible for {self.model.__name__}, \"\n                        \"use 'permanent=True' for permanent deletion.\"\n                    )\n                super().delete(*args, **kwargs)\n\n    def to_list(self, field: str | None = None) -> list[SQLRecord] | list[str]:\n        
\"\"\"Populate an (unordered) list with the results.\n\n        Note that the order in this list is only meaningful if you ordered the underlying query set with `.order_by()`.\n\n        Examples::\n\n            queryset.to_list()  # list of records\n            queryset.to_list(\"name\")  # list of values\n        \"\"\"\n        if field is None:\n            return list(self)\n        else:\n            # list casting is necessary because values_list does not return a list\n            return list(self.values_list(field, flat=True))\n\n    def first(self) -> SQLRecord | None:\n        \"\"\"If non-empty, the first result in the query set, otherwise ``None``.\n\n        Examples::\n\n            queryset.first()\n        \"\"\"\n        if len(self) == 0:\n            return None\n        return self[0]\n\n    def one(self) -> SQLRecord:\n        \"\"\"Exactly one result. Raises error if there are more or none.\"\"\"\n        return one_helper(self)\n\n    def one_or_none(self) -> SQLRecord | None:\n        \"\"\"At most one result. 
Returns it if there is one, otherwise returns ``None``.\n\n        Examples::\n\n            ULabel.filter(name=\"benchmark\").one_or_none()\n            ULabel.filter(name=\"non existing label\").one_or_none()\n        \"\"\"\n        return one_helper(self, raise_doesnotexist=False)\n\n    @doc_args(_search.__doc__)\n    def search(self, string: str, **kwargs):\n        \"\"\"{}\"\"\"  # noqa: D415\n        return _search(cls=self, string=string, **kwargs)\n\n    @doc_args(_lookup.__doc__)\n    def lookup(self, field: StrField | None = None, **kwargs) -> NamedTuple:\n        \"\"\"{}\"\"\"  # noqa: D415\n        return _lookup(cls=self, field=field, **kwargs)\n\n    # -------------------------------------------------------------------------------------\n    # CanCurate\n    # -------------------------------------------------------------------------------------\n\n    @doc_args(CanCurate.validate.__doc__)\n    def validate(self, values: ListLike, field: str | StrField | None = None, **kwargs):\n        \"\"\"{}\"\"\"  # noqa: D415\n        return _validate(cls=self, values=values, field=field, **kwargs)\n\n    @doc_args(CanCurate.inspect.__doc__)\n    def inspect(self, values: ListLike, field: str | StrField | None = None, **kwargs):\n        \"\"\"{}\"\"\"  # noqa: D415\n        return _inspect(cls=self, values=values, field=field, **kwargs)\n\n    @doc_args(CanCurate.standardize.__doc__)\n    def standardize(\n        self, values: Iterable, field: str | StrField | None = None, **kwargs\n    ):\n        \"\"\"{}\"\"\"  # noqa: D415\n        return _standardize(cls=self, values=values, field=field, **kwargs)\n\n\n# this differs from BasicQuerySet only in .filter and .get\n# QueryManager returns BasicQuerySet because it is problematic to redefine .filter and .get\n# for a query set used by the default manager\nclass QuerySet(BasicQuerySet):\n    \"\"\"Sets of records returned by queries.\n\n    Implements additional filtering capabilities.\n\n    See Also:\n\n     
   `django QuerySet <https://docs.djangoproject.com/en/4.2/ref/models/querysets/>`__\n\n    Examples:\n\n        >>> ULabel(name=\"my label\").save()\n        >>> queryset = ULabel.filter(name=\"my label\")\n        >>> queryset # an instance of QuerySet\n    \"\"\"\n\n    def _handle_unknown_field(self, error: FieldError) -> None:\n        \"\"\"Suggest available fields if an unknown field was passed.\"\"\"\n        if \"Cannot resolve keyword\" in str(error):\n            field = str(error).split(\"'\")[1]\n            avail_fields = self.model.__get_available_fields__()\n            fields = \", \".join(sorted(avail_fields))\n            raise FieldError(\n                f\"Unknown field '{field}'. Available fields: {fields}\"\n            ) from None\n        raise error  # pragma: no cover\n\n    def get(self, idlike: int | str | None = None, **expressions) -> SQLRecord:\n        \"\"\"Query a single record. Raises error if there are more or none.\"\"\"\n        is_run_input = expressions.pop(\"is_run_input\", False)\n\n        # artifacts_from_path and get accept only BasicQuerySet\n        qs = self._to_class(BasicQuerySet, copy=True)\n\n        if path := expressions.pop(\"path\", None):\n            from .artifact_set import ArtifactSet, artifacts_from_path\n\n            if not isinstance(self, ArtifactSet):\n                raise ValueError(\"Querying by path is only possible for artifacts.\")\n            qs = artifacts_from_path(qs, path)\n\n        try:\n            record = get(qs, idlike, **expressions)\n        except ValueError as e:\n            # Pass through original error for explicit id lookups\n            if \"Field 'id' expected a number\" in str(e):\n                if \"id\" in expressions:\n                    raise\n                field = next(iter(expressions))\n                raise FieldError(\n                    f\"Invalid lookup '{expressions[field]}' for {field}. 
Did you mean {field}__name?\"\n                ) from None\n            raise  # pragma: no cover\n        except FieldError as e:\n            self._handle_unknown_field(e)\n            raise  # pragma: no cover\n\n        if is_run_input is not False:  # might be None or True or Run\n            from .artifact import Artifact, track_run_input\n            from .collection import Collection\n\n            if isinstance(record, (Artifact, Collection)):\n                track_run_input(record, is_run_input)\n\n        return record\n\n    def filter(self, *queries, **expressions) -> QuerySet:\n        \"\"\"Query a set of records.\"\"\"\n        from lamindb.models import Artifact, Record, Run\n\n        from .feature import FeaturePredicate\n\n        feature_predicates = [q for q in queries if isinstance(q, FeaturePredicate)]\n        queries = tuple(q for q in queries if not isinstance(q, FeaturePredicate))\n        registry = self.model\n        is_status_filter_on_run = registry is Run and any(\n            key.split(\"__\")[0] == \"status\" for key in expressions\n        )\n        can_filter_with_features = registry in {\n            Artifact,\n            Run,\n            Record,\n        }\n        if (\n            not expressions.pop(\"_skip_filter_with_features\", False)\n            and can_filter_with_features\n            and not is_status_filter_on_run\n        ):\n            from ._feature_manager import filter_with_features\n\n            qs = filter_with_features(self, *queries, **expressions)\n        else:\n            # Suggest to use __name for related fields such as id when not passed\n            for field, value in expressions.items():\n                if (\n                    isinstance(value, str)\n                    and value.strip(\"-\").isalpha()\n                    and \"__\" not in field\n                    and hasattr(registry, field)\n                ):\n                    field_attr = getattr(registry, field)\n             
       if hasattr(field_attr, \"field\") and field_attr.field.related_model:\n                        raise FieldError(\n                            f\"Invalid lookup '{value}' for {field}. Did you mean {field}__name?\"\n                        )\n            expressions = process_expressions(self, queries, expressions)\n            # need to run a query if queries or expressions are not empty\n            if queries or expressions:\n                try:\n                    qs = super().filter(*queries, **expressions)\n                except FieldError as e:\n                    self._handle_unknown_field(e)\n            else:\n                qs = self\n        if feature_predicates:\n            if not can_filter_with_features:\n                raise FieldError(\n                    f\"Feature predicates are only supported for Artifact, Run, and Record, not {registry.__name__}.\"\n                )\n            from ._feature_manager import filter_with_feature_predicates\n\n            # Run predicate translation on a BasicQuerySet clone.\n            # - `copy=True` avoids mutating `qs.__class__` in place while we temporarily\n            #   switch query set type for this translation phase.\n            # - We intentionally do not use `_skip_filter_with_features` here: that flag\n            #   guards the QuerySet.filter() feature dispatcher path, while this code\n            #   bypasses that dispatcher and executes predicate translation directly.\n            qs = filter_with_feature_predicates(\n                qs._to_class(BasicQuerySet, copy=True), feature_predicates\n            )._to_class(type(qs), copy=False)\n        return qs\n\n\n@final\nclass NonInstantiableQuerySet:\n    \"\"\"Wrapper around QuerySet that prevents instantiation while preserving query methods.\"\"\"\n\n    def __init__(self, qs: QuerySet, registry_name: str):\n        self._qs = qs\n        self._name = registry_name\n\n    def __repr__(self) -> str:\n        return f\"<QuerySet 
[{self._name}]>\"\n\n    def __call__(self, *args, **kwargs):\n        raise TypeError(\n            f\"Cannot instantiate {self._name} from DB. \"\n            f\"Use {self._name}.filter(), {self._name}.get(), etc. to query records.\"\n        )\n\n    def __getattr__(self, attr):\n        return getattr(self._qs, attr)\n\n\nclass ModuleNamespace:\n    \"\"\"Namespace for accessing registries from a specific schema module.\n\n    Args:\n        query_db: Parent DB instance.\n        module_name: Name of the schema module (e.g., 'bionty', 'pertdb').\n    \"\"\"\n\n    def __init__(self, query_db: DB, module_name: str):\n        self._query_db = query_db\n        self._module_name = module_name\n        self._cache: dict[str, NonInstantiableQuerySet] = {}\n\n    def __getattr__(self, name: str) -> NonInstantiableQuerySet:\n        \"\"\"Access a registry class from this schema module.\n\n        Args:\n            name: Registry class name (e.g., 'Gene', 'CellType').\n\n        Returns:\n            QuerySet for the specified registry scoped to the parent instance.\n        \"\"\"\n        if name in self._cache:\n            return self._cache[name]\n\n        try:\n            schema_module = import_module(self._module_name)\n            if hasattr(schema_module, name):\n                model_class = getattr(schema_module, name)\n                queryset = model_class.connect(self._query_db._instance)\n                wrapped = NonInstantiableQuerySet(queryset, name)\n                self._cache[name] = wrapped\n                return wrapped\n        except (ImportError, AttributeError):\n            pass\n\n        raise AttributeError(\n            f\"Registry '{name}' not found in lamindb. 
Use .bt.{name} or .pertdb.{name} for schema-specific registries.\"\n        )\n\n    def __dir__(self) -> list[str]:\n        \"\"\"Return list of available registries in this schema module.\"\"\"\n        base_attrs = [attr for attr in object.__dir__(self) if not attr.startswith(\"_\")]\n        try:\n            schema_module = import_module(self._module_name)\n            if hasattr(schema_module, \"__all__\"):\n                registries = set()\n                for class_name in schema_module.__all__:\n                    model_class = getattr(schema_module, class_name, None)\n                    if model_class and hasattr(model_class, \"connect\"):\n                        registries.add(class_name)\n                return sorted(set(base_attrs) | registries)\n        except ImportError:\n            pass\n        return base_attrs\n\n\nclass BiontyDB(ModuleNamespace):\n    \"\"\"Namespace for Bionty registries (Gene, CellType, Disease, etc.).\"\"\"\n\n    Gene: QuerySet[Gene]  # type: ignore[type-arg]\n    Protein: QuerySet[Protein]  # type: ignore[type-arg]\n    CellType: QuerySet[CellType]  # type: ignore[type-arg]\n    Disease: QuerySet[Disease]  # type: ignore[type-arg]\n    Phenotype: QuerySet[Phenotype]  # type: ignore[type-arg]\n    Pathway: QuerySet[Pathway]  # type: ignore[type-arg]\n    Tissue: QuerySet[Tissue]  # type: ignore[type-arg]\n    CellLine: QuerySet[CellLine]  # type: ignore[type-arg]\n    CellMarker: QuerySet[CellMarker]  # type: ignore[type-arg]\n    Organism: QuerySet[Organism]  # type: ignore[type-arg]\n    ExperimentalFactor: QuerySet[ExperimentalFactor]  # type: ignore[type-arg]\n    DevelopmentalStage: QuerySet[DevelopmentalStage]  # type: ignore[type-arg]\n    Ethnicity: QuerySet[Ethnicity]  # type: ignore[type-arg]\n\n\nclass PertdbDB(ModuleNamespace):\n    \"\"\"Namespace for `PertDB` registries (Biologic, Compound, etc.).\"\"\"\n\n    Biologic: QuerySet[Biologic]  # type: ignore[type-arg]\n    Compound: QuerySet[Compound]  # 
type: ignore[type-arg]\n    CompoundPerturbation: QuerySet[CompoundPerturbation]  # type: ignore[type-arg]\n    GeneticPerturbation: QuerySet[GeneticPerturbation]  # type: ignore[type-arg]\n    EnvironmentalPerturbation: QuerySet[EnvironmentalPerturbation]  # type: ignore[type-arg]\n    CombinationPerturbation: QuerySet[CombinationPerturbation]  # type: ignore[type-arg]\n    PerturbationTarget: QuerySet[PerturbationTarget]  # type: ignore[type-arg]\n\n\nclass DB:\n    \"\"\"Query any registry of any instance.\n\n    Args:\n        instance: Instance identifier in format \"account/instance\".\n\n    Examples:\n\n        Query objects from an instance::\n\n            db = ln.DB(\"laminlabs/cellxgene\")\n\n        Query artifacts and filter by `suffix`::\n\n            db.Artifact.filter(suffix=\".h5ad\").to_dataframe()\n\n        Get a single artifact by uid::\n\n            artifact = db.Artifact.get(\"abcDEF123456\")\n\n        Query records and filter by name::\n\n            db.Record.filter(name__startswith=\"sample\").to_dataframe()\n\n        Get a cell type object::\n\n            t_cell = db.bionty.CellType.get(name=\"T cell\")\n\n        Create a lookup object to auto-complete all cell types in the database::\n\n            cell_types = db.bionty.CellType.lookup()\n\n        Return a `DataFrame` with additional info::\n\n            db.Artifact.filter(\n                suffix=\".h5ad\",\n                description__contains=\"immune\",\n                size__gt=1e9,  # size > 1GB\n                cell_types__name__in=[\"B cell\", \"T cell\"],\n            ).order_by(\"created_at\").to_dataframe(\n                include=[\"cell_types__name\", \"created_by__handle\"]  # include additional info\n            ).head()\n    \"\"\"\n\n    Artifact: QuerySet[Artifact]  # type: ignore[type-arg]\n    Collection: QuerySet[Collection]  # type: ignore[type-arg]\n    Transform: QuerySet[Transform]  # type: ignore[type-arg]\n    Run: QuerySet[Run]  # type: 
ignore[type-arg]\n    User: QuerySet[User]  # type: ignore[type-arg]\n    Storage: QuerySet[Storage]  # type: ignore[type-arg]\n    Feature: QuerySet[Feature]  # type: ignore[type-arg]\n    ULabel: QuerySet[ULabel]  # type: ignore[type-arg]\n    Record: QuerySet[Record]  # type: ignore[type-arg]\n    Schema: QuerySet[Schema]  # type: ignore[type-arg]\n    Project: QuerySet[Project]  # type: ignore[type-arg]\n    Reference: QuerySet[Reference]  # type: ignore[type-arg]\n    Branch: QuerySet[Branch]  # type: ignore[type-arg]\n    Space: QuerySet[Space]  # type: ignore[type-arg]\n\n    bionty: BiontyDB\n    pertdb: PertdbDB\n\n    def __init__(self, instance: str):\n        self._instance = instance\n        self._cache: dict[str, NonInstantiableQuerySet | BiontyDB | PertdbDB] = {}\n        self._available_registries: set[str] | None = None\n\n        owner, instance_name = (\n            ln_setup._connect_instance.get_owner_name_from_identifier(instance)\n        )\n        instance_info = ln_setup._connect_instance._connect_instance(\n            owner=owner, name=instance_name\n        )\n        self._modules = [\"lamindb\"] + list(instance_info.modules)\n\n    def __getattr__(self, name: str) -> NonInstantiableQuerySet | BiontyDB | PertdbDB:\n        \"\"\"Access a registry class or schema namespace for this database instance.\n\n        Args:\n            name: Registry class name (e.g., 'Artifact', 'Collection') or schema namespace ('bionty', 'pertdb').\n\n        Returns:\n            QuerySet for the specified registry or schema namespace scoped to this instance.\n        \"\"\"\n        if name in self._cache:\n            return self._cache[name]\n\n        if name == \"bionty\":\n            if \"bionty\" not in self._modules:\n                raise AttributeError(\n                    f\"Schema 'bionty' not available in instance '{self._instance}'.\"\n                )\n            if \"bionty\" not in self._cache:\n                namespace = 
BiontyDB(self, \"bionty\")\n                self._cache[\"bionty\"] = namespace\n            return self._cache[\"bionty\"]\n\n        if name == \"pertdb\":\n            if \"pertdb\" not in self._modules:\n                raise AttributeError(\n                    f\"Schema 'pertdb' not available in instance '{self._instance}'.\"\n                )\n            if \"pertdb\" not in self._cache:\n                namespace = PertdbDB(self, \"pertdb\")  # type: ignore\n                self._cache[\"pertdb\"] = namespace\n            return self._cache[\"pertdb\"]\n\n        try:\n            lamindb_module = import_module(\"lamindb\")\n            if hasattr(lamindb_module, name):\n                model_class = getattr(lamindb_module, name)\n                queryset = model_class.connect(self._instance)\n                wrapped = NonInstantiableQuerySet(queryset, name)\n                self._cache[name] = wrapped\n                return wrapped\n        except (ImportError, AttributeError):\n            pass\n\n        raise AttributeError(\n            f\"Registry '{name}' not found in lamindb core registries. 
Use .bionty.{name} or .pertdb.{name} for schema-specific registries.\"\n        )\n\n    def __repr__(self) -> str:\n        return f\"DB('{self._instance}')\"\n\n    def __dir__(self) -> list[str]:\n        \"\"\"Return list of available registries and schema namespaces.\"\"\"\n        base_attrs = [attr for attr in super().__dir__() if not attr.startswith(\"_\")]\n\n        lamindb_registries = set()\n        try:\n            lamindb_module = import_module(\"lamindb\")\n            if hasattr(lamindb_module, \"__all__\"):\n                for class_name in lamindb_module.__all__:\n                    model_class = getattr(lamindb_module, class_name, None)\n                    if model_class and hasattr(model_class, \"connect\"):\n                        lamindb_registries.add(class_name)\n        except ImportError:\n            pass\n\n        module_namespaces = set()\n        if \"bionty\" in self._modules:\n            module_namespaces.add(\"bionty\")\n        if \"pertdb\" in self._modules:\n            module_namespaces.add(\"pertdb\")\n\n        return sorted(set(base_attrs) | lamindb_registries | module_namespaces)\n"
  },
  {
    "path": "lamindb/models/record.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Any, overload\n\nimport pgtrigger\nfrom django.conf import settings as django_settings\nfrom django.db import models\nfrom django.db.models import CASCADE, PROTECT\nfrom lamin_utils import logger\n\nfrom lamindb.base.fields import (\n    CharField,\n    DateTimeField,\n    ForeignKey,\n    JSONField,\n    TextField,\n)\nfrom lamindb.base.utils import class_and_instance_method, strict_classmethod\nfrom lamindb.errors import FieldValidationError\n\nfrom ..base.uids import base62_16\nfrom .artifact import Artifact\nfrom .can_curate import CanCurate\nfrom .collection import Collection\nfrom .feature import Feature, convert_to_pandas_dtype\nfrom .has_parents import HasParents, _query_relatives\nfrom .query_set import (\n    QuerySet,\n    encode_lamindb_fields_as_columns,\n    get_default_branch_ids,\n    reorder_subset_columns_in_df,\n)\nfrom .run import Run, TracksRun, TracksUpdates, User, current_run, current_user_id\nfrom .sqlrecord import BaseSQLRecord, HasType, IsLink, SQLRecord, _get_record_kwargs\nfrom .transform import Transform\nfrom .ulabel import ULabel\n\nif TYPE_CHECKING:\n    from datetime import datetime\n\n    import pandas as pd\n\n    from ._feature_manager import FeatureManager\n    from .block import RecordBlock\n    from .project import Project, RecordProject, RecordReference, Reference\n    from .query_manager import RelatedManager\n    from .query_set import SQLRecordList\n    from .schema import Schema\n\n\n# keep docstring in sync with test_record_docstring_examples in test_record_basics.py\nIMPORTS_UID = \"W3WdiFRZTvTJajNp\"\nSCHEMA_IMPORTS_UID = \"DGZkj4yhGWMJE5fu\"\n\n\nclass RecordBatch:\n    \"\"\"DataFrame-backed batch created by :meth:`Record.from_dataframe`.\"\"\"\n\n    def __init__(\n        self,\n        *,\n        cls: type[Record],\n        df: pd.DataFrame,\n        resolved_type: Record,\n        name_field: str,\n    ) -> None:\n        
self._cls = cls\n        self._df = df\n        self._resolved_type = resolved_type\n        self._name_field = name_field\n        self._records: list[Record] | None = None\n\n    def __len__(self) -> int:\n        return len(self._df)\n\n    @property\n    def type(self) -> Record:\n        return self._resolved_type\n\n    def _build_records(self) -> list[Record]:\n        import pandas as pd\n\n        records: list[Record] = []\n        row_dicts = self._df.to_dict(orient=\"records\")\n        for row in row_dicts:\n            if self._name_field in row:\n                name = row.pop(self._name_field)\n            elif \"name\" in row:\n                name = row.pop(\"name\")\n            else:\n                name = None\n            if pd.api.types.is_scalar(name) and pd.isna(name):\n                name = None\n\n            features: dict[str, Any] = {}\n            for key, value in row.items():\n                if pd.api.types.is_scalar(value) and pd.isna(value):\n                    continue\n                features[key] = value\n\n            record_kwargs: dict[str, Any] = {\"type\": self._resolved_type}\n            if features:\n                record_kwargs[\"features\"] = features\n            records.append(self._cls(name=name, **record_kwargs))\n        return records\n\n    def save(self) -> SQLRecordList[Record]:\n        \"\"\"Persist all records and their feature values.\"\"\"\n        from .query_set import SQLRecordList\n        from .save import save as ln_save\n\n        if self._records is None:\n            self._records = self._build_records()\n        ln_save(self._records)\n        return SQLRecordList(self._records)\n\n\nclass Record(SQLRecord, HasType, HasParents, CanCurate, TracksRun, TracksUpdates):\n    \"\"\"Flexible records with sheets & markdown pages.\n\n    Useful for managing samples, donors, cells, compounds, sequences, and other custom entities with their features.\n\n    If you just want a simple label, use 
:class:`~lamindb.ULabel`.\n\n    Args:\n        name: `str | None = None` A name.\n        description: `str | None = None` A description.\n        type: `Record | None = None` The type of this record.\n        is_type: `bool = False` Whether this record is a type (a record that\n            classifies other records).\n        features: `dict[str | Feature, Any] | None = None` Lazy feature values\n            to persist on `.save()` or `ln.save([...])`.\n        schema: `Schema | None = None` A schema defining allowed features for records of this type. Only applicable when `is_type=True`.\n        reference: `str | None = None` For instance, an external ID or a URL.\n        reference_type: `str | None = None` For instance, `\"url\"`.\n\n    See Also:\n        :class:`~lamindb.Feature`\n            Dimensions of measurement (e.g. column of a sheet, attribute of a record).\n        :class:`~lamindb.ULabel`\n            Like `Record`, just without the ability to store features.\n\n    Examples:\n\n        Create a **record** with a single feature::\n\n            # create a feature if you don't yet have one\n            gc_content = ln.Feature(name=\"gc_content\", dtype=float).save()\n\n            # create a record to track a sample\n            sample1 = ln.Record(name=\"Sample 1\", features={\"gc_content\": 0.5}).save()\n\n            # describe the record\n            sample1.describe()\n\n        Group several records under a **record type**, optionally constrained with a :class:`~lamindb.Schema`::\n\n            # create a flexible record type to track experiments\n            experiment_type = ln.Record(name=\"Experiment\", is_type=True).save()\n            experiment1 = ln.Record(name=\"Experiment 1\", type=experiment_type).save()\n\n            # create a feature to link experiments\n            experiment = ln.Feature(name=\"experiment\", dtype=experiment_type).save()\n\n            # create a record type to track samples -- constrain it with a schema\n     
       schema = ln.Schema([experiment, gc_content.with_config(optional=True)], name=\"sample_schema\").save()\n            sample_sheet = ln.Record(name=\"Sample Sheet\", is_type=True, schema=schema).save()\n\n            # group the sample1 record under the sample sheet\n            sample1.type = sample_sheet\n            sample1.save()\n\n            # reset the feature values for the record including the experiment\n            sample1.features.set_values({\n                \"gc_content\": 0.5,\n                \"experiment\": \"Experiment 1\",  # automatically resolves by name, also accepts the experiment1 object\n            })\n\n        Export all records under a type to a dataframe::\n\n            experiment_type.to_dataframe()\n            #> __lamindb_record_name__   ...\n            #>            Experiment 1   ...\n            #>            Experiment 2   ...\n\n        Import records from a dataframe :meth:`~lamindb.Record.from_dataframe`::\n\n            records = ln.Record.from_dataframe(df, type=\"my_df\").save()  # creates a type my_df with inferred schema\n\n        If you try to set incomplete features in a record in a sheet, you'll get a validation error::\n\n            sample2 = ln.Record(name=\"Sample 2\", type=sample_sheet).save()\n            sample2.features.set_values({\"gc_content\": 0.6})  # raises ValidationError because experiment is missing\n\n        Query records by features::\n\n            ln.Record.filter(gc_content=0.55)     # exact match\n            ln.Record.filter(gc_content__gt=0.5)  # greater than\n            ln.Record.filter(type=sample_sheet)   # just the record on the sheet\n\n        If your feature names are ambiguous, you can use a `Feature` object to disambiguate::\n\n            # to set feature values\n            sample1.features.set_values({gc_content: 0.5})  # gc_content is the feature object\n\n            # to query by feature values\n            ln.Record.filter(gc_content == 0.5)  # instead of 
gc_content=0.5\n\n        You can edit records like spreadsheets on the hub:\n\n        .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/XSzhWUb0EoHOejiw0001.png\n            :width: 800px\n\n        Just like for :class:`~lamindb.ULabel`, you can also model **ontologies** through the `parents`/`children` attributes.\n\n    .. dropdown:: What is the difference between `Record` and `SQLRecord`?\n\n        The features of a `Record` are flexible: you can dynamically define features and add features to a record.\n        The fields of a `SQLRecord` are static: you need to define them in code and then migrate the underlying database.\n\n        See :class:`~lamindb.models.SQLRecord` or the glossary for more information: :term:`docs:record`.\n\n    \"\"\"\n\n    class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta):\n        abstract = False\n        app_label = \"lamindb\"\n        if (\n            django_settings.DATABASES.get(\"default\", {}).get(\"ENGINE\")\n            == \"django.db.backends.postgresql\"\n        ):\n            triggers = [\n                pgtrigger.Trigger(\n                    name=\"prevent_record_type_cycle\",\n                    operation=pgtrigger.Update | pgtrigger.Insert,\n                    when=pgtrigger.Before,\n                    condition=pgtrigger.Condition(\"NEW.type_id IS NOT NULL\"),\n                    func=\"\"\"\n                        -- Check for direct self-reference\n                        IF NEW.type_id = NEW.id THEN\n                            RAISE EXCEPTION 'Cannot set type: record cannot be its own type';\n                        END IF;\n\n                        -- Check for cycles in the type chain\n                        IF EXISTS (\n                            WITH RECURSIVE type_chain AS (\n                                SELECT type_id, 1 as depth\n                                FROM lamindb_record\n                                WHERE id = NEW.type_id\n\n                      
          UNION ALL\n\n                                SELECT r.type_id, tc.depth + 1\n                                FROM lamindb_record r\n                                INNER JOIN type_chain tc ON r.id = tc.type_id\n                                WHERE tc.depth < 100\n                            )\n                            SELECT 1 FROM type_chain WHERE type_id = NEW.id\n                        ) THEN\n                            RAISE EXCEPTION 'Cannot set type: would create a cycle';\n                        END IF;\n\n                        RETURN NEW;\n                    \"\"\",\n                ),\n            ]\n        # also see raw SQL constraints for `is_type` and `type` FK validity in migrations\n\n    _name_field: str = \"name\"\n\n    id: int = models.AutoField(primary_key=True)\n    \"\"\"Internal id, valid only in one DB instance.\"\"\"\n    uid: str = CharField(\n        editable=False, unique=True, db_index=True, max_length=16, default=base62_16\n    )\n    \"\"\"A universal random id, valid across DB instances.\"\"\"\n    name: str = CharField(max_length=150, db_index=True, null=True)\n    \"\"\"Name or title of record (optional).\"\"\"\n    type: Record | None = ForeignKey(\"self\", PROTECT, null=True, related_name=\"records\")\n    \"\"\"Type of record, e.g., `Sample`, `Donor`, `Cell`, `Compound`, `Sequence` ← :attr:`~lamindb.Record.records`.\n\n    Allows to group records by type, e.g., all samples, all donors, all cells, all compounds, all sequences.\n    \"\"\"\n    records: RelatedManager[Record]\n    \"\"\"If a `type` (`is_type=True`), records of this `type`.\"\"\"\n    description: str | None = TextField(null=True)\n    \"\"\"A description.\"\"\"\n    reference: str | None = CharField(max_length=255, db_index=True, null=True)\n    \"\"\"A simple reference like a URL or external ID.\"\"\"\n    reference_type: str | None = CharField(max_length=25, db_index=True, null=True)\n    \"\"\"Type of simple reference.\"\"\"\n    
extra_data: dict | None = models.JSONField(null=True)\n    \"\"\"Additional data in JSON format, not validated as features.\"\"\"\n    schema: Schema | None = ForeignKey(\n        \"Schema\", CASCADE, null=True, related_name=\"records\"\n    )\n    \"\"\"A schema to enforce for a type ← :attr:`~lamindb.Schema.records`.\n\n    This is analogous to the `schema` attribute of an `Artifact`.\n    If `is_type` is `True`, the schema is used to enforce features for each record of this type.\n    \"\"\"\n    linked_records: RelatedManager[Record] = models.ManyToManyField(\n        \"Record\",\n        through=\"RecordRecord\",\n        symmetrical=False,\n        related_name=\"linked_in_records\",\n    )\n    \"\"\"Records linked in this record as a value ← :attr:`~lamindb.Record.linked_in_records`.\"\"\"\n    linked_in_records: RelatedManager[Record]\n    \"\"\"Records linking this record as a value. Is reverse accessor for `linked_records`.\"\"\"\n    parents: RelatedManager[Record] = models.ManyToManyField(\n        \"self\", symmetrical=False, related_name=\"children\"\n    )\n    \"\"\"Ontological parents of this record ← :attr:`~lamindb.Record.children`.\n\n    You can build an ontology under a given `type`. For example, introduce a type `CellType` and model the hierarchy of cell types under it via `parents` and `children`.\n    \"\"\"\n    children: RelatedManager[Record]\n    \"\"\"Ontological children of this record. 
Is reverse accessor for `parents`.\"\"\"\n    # this is handled manually here because we want to set the related_name attribute\n    # (this doesn't happen via inheritance of TracksRun, everything else is the same)\n    run: Run | None = ForeignKey(\n        Run,\n        PROTECT,\n        related_name=\"output_records\",\n        null=True,\n        default=current_run,\n        editable=False,\n    )\n    \"\"\"Run that created the record ← :attr:`~lamindb.Run.output_records`.\"\"\"\n    input_of_runs: RelatedManager[Run] = models.ManyToManyField(\n        Run, related_name=\"input_records\"\n    )\n    \"\"\"Runs that use this record as an input ← :attr:`~lamindb.Run.input_records`.\"\"\"\n    artifacts: RelatedManager[Artifact] = models.ManyToManyField(\n        Artifact, through=\"ArtifactRecord\", related_name=\"records\"\n    )\n    \"\"\"Artifacts annotated by this record ← :attr:`~lamindb.Artifact.records`.\"\"\"\n    runs: RelatedManager[Run] = models.ManyToManyField(\n        Run, through=\"RunRecord\", related_name=\"records\"\n    )\n    \"\"\"Runs annotated by this record ← :attr:`~lamindb.Run.records`.\"\"\"\n    transforms: RelatedManager[Transform] = models.ManyToManyField(\n        Transform, through=\"TransformRecord\", related_name=\"records\"\n    )\n    \"\"\"Transforms annotated by this record ← :attr:`~lamindb.Transform.records`.\"\"\"\n    collections: RelatedManager[Collection] = models.ManyToManyField(\n        Collection, through=\"CollectionRecord\", related_name=\"records\"\n    )\n    \"\"\"Collections annotated by this record ← :attr:`~lamindb.Collection.records`.\"\"\"\n    projects: RelatedManager[Project]\n    \"\"\"Projects that annotate this record ← :attr:`~lamindb.Project.records`.\"\"\"\n    references: RelatedManager[Reference]\n    \"\"\"References that annotate this record ← :attr:`~lamindb.Reference.records`.\"\"\"\n    linked_transforms: RelatedManager[Transform]\n    \"\"\"Transforms linked in this record as values ← 
:attr:`~lamindb.Transform.linked_in_records`.\"\"\"\n    linked_runs: RelatedManager[Run]\n    \"\"\"Runs linked in this record as values ← :attr:`~lamindb.Run.linked_in_records`.\"\"\"\n    linked_ulabels: RelatedManager[ULabel]\n    \"\"\"ULabels linked in this record as values ← :attr:`~lamindb.ULabel.linked_in_records`.\"\"\"\n    linked_artifacts: RelatedManager[Artifact]\n    \"\"\"Artifacts linked in this record as values ← :attr:`~lamindb.Artifact.linked_in_records`.\"\"\"\n    linked_projects: RelatedManager[Project]\n    \"\"\"Projects linked in this record as values ← :attr:`~lamindb.Project.linked_in_records`.\"\"\"\n    linked_references: RelatedManager[Reference]\n    \"\"\"References linked in this record as values ← :attr:`~lamindb.Reference.linked_in_records`.\"\"\"\n    linked_collections: RelatedManager[Collection]\n    \"\"\"Collections linked in this record as values ← :attr:`~lamindb.Collection.linked_in_records`.\"\"\"\n    linked_users: RelatedManager[User]\n    \"\"\"Users linked in this record as values ← :attr:`~lamindb.User.linked_in_records`.\"\"\"\n    ablocks: RelatedManager[RecordBlock]\n    \"\"\"Attached blocks ← :attr:`~lamindb.RecordBlock.record`.\"\"\"\n    values_json: RelatedManager[RecordJson]\n    \"\"\"JSON values `(record_id, feature_id, value)`.\"\"\"\n    values_record: RelatedManager[RecordRecord]\n    \"\"\"Record values with their features `(record_id, feature_id, value_id)`.\"\"\"\n    values_ulabel: RelatedManager[RecordULabel]\n    \"\"\"ULabel values with their features `(record_id, feature_id, value_id)`.\"\"\"\n    values_user: RelatedManager[RecordUser]\n    \"\"\"User values with their features `(record_id, feature_id, value_id)`.\"\"\"\n    values_transform: RelatedManager[RecordTransform]\n    \"\"\"Transform values with their features `(record_id, feature_id, value_id)`.\"\"\"\n    values_run: RelatedManager[RecordRun]\n    \"\"\"Run values with their features `(record_id, feature_id, value_id)`.\"\"\"\n    
values_artifact: RelatedManager[RecordArtifact]\n    \"\"\"Artifact values with their features `(record_id, feature_id, value_id)`.\"\"\"\n    values_collection: RelatedManager[RecordCollection]\n    \"\"\"Collection values with their features `(record_id, feature_id, value_id)`.\"\"\"\n    values_reference: RelatedManager[RecordReference]\n    \"\"\"Reference values with their features `(record_id, feature_id, value_id)`.\"\"\"\n    values_project: RelatedManager[RecordProject]\n    \"\"\"Project values with their features `(record_id, feature_id, value_id)`.\"\"\"\n\n    @overload\n    def __init__(\n        self,\n        name: str | None = None,\n        type: Record | None = None,\n        is_type: bool = False,\n        features: dict[str | Feature, Any] | None = None,\n        description: str | None = None,\n        schema: Schema | None = None,\n        reference: str | None = None,\n        reference_type: str | None = None,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(\n        self,\n        *args,\n        **kwargs,\n    ):\n        if len(args) == len(self._meta.concrete_fields):\n            super().__init__(*args, **kwargs)\n            return None\n        if len(args) > 0:\n            raise ValueError(\"Only one non-keyword arg allowed\")\n        name: str = kwargs.pop(\"name\", None)\n        type: str | None = kwargs.pop(\"type\", None)\n        is_type: bool = kwargs.pop(\"is_type\", False)\n        features: dict[str | Feature, Any] | None = kwargs.pop(\"features\", None)\n        description: str | None = kwargs.pop(\"description\", None)\n        schema: Schema | None = kwargs.pop(\"schema\", None)\n        reference: str | None = kwargs.pop(\"reference\", None)\n        reference_type: str | None = kwargs.pop(\"reference_type\", None)\n        branch = kwargs.pop(\"branch\", None)\n        branch_id = kwargs.pop(\"branch_id\", 1)\n        space = kwargs.pop(\"space\", 
None)\n        space_id = kwargs.pop(\"space_id\", 1)\n        _skip_validation = kwargs.pop(\"_skip_validation\", False)\n        _aux = kwargs.pop(\"_aux\", None)\n        if len(kwargs) > 0:\n            valid_keywords = \", \".join([val[0] for val in _get_record_kwargs(Record)])\n            raise FieldValidationError(\n                f\"Only {valid_keywords} are valid keyword arguments\"\n            )\n        if schema and not is_type:\n            logger.important(\"passing schema, treating as type\")\n            is_type = True\n        if features is not None:\n            self._features = features\n        super().__init__(\n            name=name,\n            type=type,\n            is_type=is_type,\n            description=description,\n            reference=reference,\n            reference_type=reference_type,\n            schema=schema,\n            branch=branch,\n            branch_id=branch_id,\n            space=space,\n            space_id=space_id,\n            _skip_validation=_skip_validation,\n            _aux=_aux,\n        )\n\n    def save(self, *args, **kwargs) -> Record:\n        super().save(*args, **kwargs)\n        if hasattr(self, \"_features\"):\n            pending_features = self._features\n            self.features.add_values(pending_features)\n            del self._features\n        return self\n\n    @strict_classmethod\n    def from_dataframe(\n        cls,\n        df: pd.DataFrame,\n        *,\n        type: Record | str,\n        name_field: str = \"__lamindb_record_name__\",\n    ) -> RecordBatch:\n        \"\"\"Construct a dataframe-backed batch of records for bulk saving.\n\n        Returns a :class:`RecordBatch`. Follow with `records.save()`.\n\n        Args:\n            df: A dataframe where rows represent records.\n            type: Record type for all rows as either a `Record` object or a\n                string. 
If passing a string, a new type with that name is created\n                under `Imports` with an inferred schema from the dataframe.\n                If that type name already exists, raise an error and pass an\n                existing `Record` object for reuse.\n                If the resolved type is a sheet (`type.schema is not None`), feature\n                values are validated against that schema at save time.\n            name_field: Column used for record names. Falls back to `name` if\n                absent. If neither exists, records are created without names.\n\n        Examples:\n\n            Create a new type and import records::\n\n                records = ln.Record.from_dataframe(df, type=\"my_df\").save()\n\n            Import records into an existing type::\n\n                records = ln.Record.from_dataframe(df, type=sample_sheet).save()\n\n        \"\"\"\n        import pandas as pd\n\n        from .schema import Schema\n\n        if not isinstance(df, pd.DataFrame):\n            raise TypeError(\"`df` needs to be a pandas DataFrame.\")\n        resolved_type: Record\n        if isinstance(type, str):\n            imports_type = cls.filter(uid=IMPORTS_UID).one_or_none()\n            if imports_type is None:\n                imports_type = cls(name=\"Imports\", is_type=True)\n                imports_type.uid = IMPORTS_UID\n                imports_type = imports_type.save()\n            existing_type = cls.filter(\n                name=type, is_type=True, type=imports_type\n            ).one_or_none()\n            if existing_type is not None:\n                raise ValueError(\n                    f\"type '{type}' already exists under 'Imports', please pass it as a Record object to reuse.\"\n                )\n            imports_schema = Schema.filter(uid=SCHEMA_IMPORTS_UID).one_or_none()\n            if imports_schema is None:\n                imports_schema = Schema(name=\"Imports\", is_type=True)\n                imports_schema.uid = 
SCHEMA_IMPORTS_UID\n                imports_schema = imports_schema.save()\n            inferred_schema = Schema.from_dataframe(df, name=type)\n            if inferred_schema is None:\n                raise ValueError(\n                    \"Could not infer a schema from dataframe columns. \"\n                    \"Ensure dataframe columns map to existing Features, or pass an existing Record type object.\"\n                )\n            inferred_schema.type = imports_schema\n            inferred_schema = inferred_schema.save()\n            resolved_type = cls(\n                name=type,\n                is_type=True,\n                type=imports_type,\n                schema=inferred_schema,\n            ).save()\n        else:\n            resolved_type = type\n        if not resolved_type.is_type:\n            raise ValueError(\"`type` needs to be a record type (`is_type=True`).\")\n        if resolved_type.name is None:\n            raise ValueError(\"`type` needs to have a non-null `name`.\")\n\n        return RecordBatch(\n            cls=cls,\n            df=df,\n            resolved_type=resolved_type,\n            name_field=name_field,\n        )\n\n    @property\n    def features(self) -> FeatureManager:\n        \"\"\"Manage the linked feature values.\n\n        For examples, see :class:`~lamindb.Record` or :class:`~lamindb.models.FeatureManager`.\n        \"\"\"\n        from ._feature_manager import FeatureManager\n\n        return FeatureManager(self)\n\n    @property\n    def is_sheet(self) -> bool:\n        \"\"\"Check if record is a `sheet`, i.e., `self.is_type and self.schema is not None`.\"\"\"\n        return self.schema is not None and self.is_type\n\n    def query_parents(self) -> QuerySet:\n        \"\"\"Query all parents of a record recursively.\n\n        While `.parents` retrieves the direct parents, this method\n        retrieves all ancestors of the current record.\n        \"\"\"\n        return _query_relatives([self], \"parents\")  
# type: ignore\n\n    def query_children(self) -> QuerySet:\n        \"\"\"Query all children of a record recursively.\n\n        While `.children` retrieves the direct children, this method\n        retrieves all descendants of a parent.\n        \"\"\"\n        return _query_relatives([self], \"children\")  # type: ignore\n\n    def query_records(self) -> QuerySet:\n        \"\"\"Query records of sub types.\n\n        While `.records` retrieves the records with the current type, this method\n        also retrieves sub types and the records with sub types of the current type.\n        \"\"\"\n        return _query_relatives([self], \"records\")  # type: ignore\n\n    def _set_export_run(self, is_run_input: bool | Run | None = None) -> None:\n        from lamindb.core._context import context\n        from lamindb.models import Run, Transform\n\n        if isinstance(is_run_input, Run):\n            run = is_run_input\n        elif is_run_input in {True, None}:\n            if context.run is None:\n                transform, _ = Transform.objects.get_or_create(\n                    key=\"__lamindb_record_export__\", kind=\"function\"\n                )\n                run = Run(transform).save()\n            else:\n                run = context.run\n        else:\n            run = None\n        self._export_run = run\n\n    @class_and_instance_method\n    def to_dataframe(\n        cls_or_self,\n        recurse: bool = False,\n        is_run_input: bool | Run | None = None,\n        **kwargs,\n    ) -> pd.DataFrame:\n        \"\"\"Export to a pandas DataFrame.\n\n        This is roughly equivalent to::\n\n            ln.Record.filter(type=sample_type).to_dataframe(include=\"features\")\n\n        `to_dataframe()` ensures that the columns are ordered according to the schema of the type and encodes fields like `uid` and `name`.\n\n        It will also track the record as an input to the current run.\n\n        Args:\n            recurse: Whether to include records 
of sub-types recursively.\n            is_run_input: Whether to track the record as a run input.\n            **kwargs: Keyword arguments passed to :meth:`~lamindb.models.QuerySet.to_dataframe`.\n        \"\"\"\n        import pandas as pd\n\n        if isinstance(cls_or_self, type):\n            return type(cls_or_self).to_dataframe(cls_or_self, **kwargs)  # type: ignore\n        if not cls_or_self.is_type:\n            raise TypeError(\n                \"to_dataframe() can only be called on the class or on record type instance.\"\n            )\n        self = cls_or_self\n        assert self.is_type, \"Only types can be exported as dataframes\"  # noqa: S101\n\n        branch_ids = get_default_branch_ids()\n        qs = (\n            self.query_records()\n            if recurse\n            else self.records.filter(branch_id__in=branch_ids)\n        )\n        logger.important(f\"exporting {qs.count()} records of '{self.name}'\")\n        if \"order_by\" not in kwargs:\n            kwargs[\"order_by\"] = \"id\"\n        df = qs.to_dataframe(features=\"queryset\", limit=None, **kwargs)\n        encoded_id = encode_lamindb_fields_as_columns(self.__class__, \"id\")\n        encoded_uid = encode_lamindb_fields_as_columns(self.__class__, \"uid\")\n        encoded_name = encode_lamindb_fields_as_columns(self.__class__, \"name\")\n        # encode the django id, uid and name fields\n        if df.index.name == \"id\":\n            df.index.name = encoded_id\n        if \"uid\" in df.columns and encoded_uid not in df.columns:\n            df = df.rename(columns={\"uid\": encoded_uid})\n        if \"name\" in df.columns and encoded_name not in df.columns:\n            df = df.rename(columns={\"name\": encoded_name})\n        if self.schema is not None:\n            all_features = self.schema.members.all()\n            desired_order = all_features.to_list(\"name\")  # only members is ordered!\n            for feature in all_features:\n                if feature.name not 
in df.columns:\n                    df[feature.name] = pd.Series(\n                        dtype=convert_to_pandas_dtype(feature._dtype_str)\n                    )\n        else:\n            # sort alphabetically for now\n            desired_order = df.columns[2:].tolist()\n            desired_order.sort()\n        df = reorder_subset_columns_in_df(df, desired_order, position=0)  # type: ignore\n        self._set_export_run(is_run_input=is_run_input)\n        self._export_run.input_records.add(self)\n        return df.sort_index()  # order by id\n\n    def to_artifact(\n        self,\n        key: str | None = None,\n        suffix: str | None = None,\n        is_run_input: bool | Run | None = None,\n        **kwargs,\n    ) -> Artifact:\n        \"\"\"Calls `to_dataframe()` to create an artifact.\n\n        The format defaults to `.csv` unless the key specifies another format or suffix is passed.\n\n        The `key` defaults to `sheet_exports/{self.name}{suffix}` unless a `key` is passed.\n\n        Args:\n            key: `str | None = None` The artifact key.\n            suffix: `str | None = None` The suffix to append to the default key if no key is passed.\n            is_run_input: Whether to track the record as a run input.\n            **kwargs: Keyword arguments passed to :meth:`~lamindb.models.Record.to_dataframe`.\n        \"\"\"\n        assert self.is_type, \"Only types can be exported as artifacts.\"\n        assert key is None or suffix is None, \"Only one of key or suffix can be passed.\"\n        if key is None:\n            suffix = \".csv\" if suffix is None else suffix\n            key = f\"sheet_exports/{self.name}{suffix}\"\n        description = f\": {self.description}\" if self.description is not None else \"\"\n        return Artifact.from_dataframe(\n            self.to_dataframe(is_run_input=is_run_input, **kwargs),\n            key=key,\n            description=f\"Export of sheet {self.uid}{description}\",\n            
schema=self.schema,\n            csv_kwargs={\"index\": False},\n            run=self._export_run,\n        ).save()\n\n\n# for storing JSON values in records\nclass RecordJson(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    record: Record = ForeignKey(Record, CASCADE, related_name=\"values_json\")\n    feature: Feature = ForeignKey(Feature, PROTECT, related_name=\"links_recordjson\")\n    value: Any = JSONField(default=None, db_default=None)\n\n    class Meta:\n        app_label = \"lamindb\"\n        # a list is modeled as a list in json, hence no multi-value association for the same feature unlike for\n        # categorical/relational values\n        unique_together = (\"record\", \"feature\")\n\n\n# for storing record-like values in records\nclass RecordRecord(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    record: Record = ForeignKey(Record, CASCADE, related_name=\"values_record\")\n    feature: Feature = ForeignKey(Feature, PROTECT, related_name=\"links_recordrecord\")\n    value: Record = ForeignKey(Record, PROTECT, related_name=\"links_record\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"record\", \"feature\", \"value\")\n\n\n# for storing ulabel-like values in records\nclass RecordULabel(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    record: Record = ForeignKey(Record, CASCADE, related_name=\"values_ulabel\")\n    feature: Feature = ForeignKey(Feature, PROTECT, related_name=\"links_recordulabel\")\n    value: ULabel = ForeignKey(ULabel, PROTECT, related_name=\"links_record\")\n\n    class Meta:\n        # allows linking exactly one record to one ulabel per feature, because we likely don't want to have Many\n        app_label = \"lamindb\"\n        unique_together = (\"record\", \"feature\", \"value\")\n\n\n# for storing user-like values in records\nclass RecordUser(BaseSQLRecord, IsLink):\n    id: int = 
models.BigAutoField(primary_key=True)\n    record: Record = ForeignKey(Record, CASCADE, related_name=\"values_user\")\n    feature: Feature = ForeignKey(Feature, PROTECT, related_name=\"links_recorduser\")\n    value: User = ForeignKey(User, PROTECT, related_name=\"links_record\")\n\n    class Meta:\n        # allows linking exactly one record to one user per feature, because we likely don't want to have Many\n        app_label = \"lamindb\"\n        unique_together = (\"record\", \"feature\", \"value\")\n\n\n# for storing run-like values in records\nclass RecordRun(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    record: Record = ForeignKey(Record, CASCADE, related_name=\"values_run\")\n    feature: Feature = ForeignKey(Feature, PROTECT, related_name=\"links_recordrun\")\n    value: Run = ForeignKey(Run, PROTECT, related_name=\"links_in_record\")\n\n    class Meta:\n        # allows linking several records to a single run for the same feature because we'll likely need this\n        app_label = \"lamindb\"\n        unique_together = (\"record\", \"feature\", \"value\")\n\n\n# for annotating runs with records\nclass RunRecord(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    run: Run = ForeignKey(Run, CASCADE, related_name=\"links_record\")\n    record: Record = ForeignKey(Record, PROTECT, related_name=\"links_run\")\n    feature: Feature = ForeignKey(\n        Feature, PROTECT, null=True, related_name=\"links_runrecord\"\n    )\n    created_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    created_by: User = ForeignKey(\n        \"lamindb.User\", PROTECT, default=current_user_id, related_name=\"+\"\n    )\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"run\", \"record\", \"feature\")\n\n\n# for storing artifact-like values in records\nclass RecordArtifact(BaseSQLRecord, IsLink):\n    id: int = 
models.BigAutoField(primary_key=True)\n    record: Record = ForeignKey(Record, CASCADE, related_name=\"values_artifact\")\n    feature: Feature = ForeignKey(Feature, PROTECT, related_name=\"links_recordartifact\")\n    value: Artifact = ForeignKey(Artifact, PROTECT, related_name=\"links_in_record\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"record\", \"feature\", \"value\")\n\n\n# for annotating artifacts with records\nclass ArtifactRecord(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name=\"links_record\")\n    record: Record = ForeignKey(Record, PROTECT, related_name=\"links_artifact\")\n    feature: Feature = ForeignKey(\n        Feature, PROTECT, null=True, related_name=\"links_artifactrecord\"\n    )\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"artifact\", \"record\", \"feature\")\n\n\n# for storing collection-like values in records\nclass RecordCollection(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    record: Record = ForeignKey(Record, CASCADE, related_name=\"values_collection\")\n    feature: Feature = ForeignKey(\n        Feature, PROTECT, related_name=\"links_recordcollection\"\n    )\n    value: Collection = ForeignKey(Collection, PROTECT, related_name=\"links_in_record\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"record\", \"feature\", \"value\")\n\n\n# for annotating collections with records\nclass CollectionRecord(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    collection: Collection = ForeignKey(\n        Collection, CASCADE, related_name=\"links_record\"\n    )\n    record: Record = ForeignKey(Record, PROTECT, related_name=\"links_collection\")\n    feature: Feature = ForeignKey(\n        Feature, PROTECT, null=True, 
related_name=\"links_collectionrecord\"\n    )\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"collection\", \"record\", \"feature\")\n\n\n# for storing transform-like values in records\nclass RecordTransform(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    record: Record = ForeignKey(Record, CASCADE, related_name=\"values_transform\")\n    feature: Feature = ForeignKey(\n        Feature, PROTECT, related_name=\"links_recordtransform\"\n    )\n    value: Transform = ForeignKey(Transform, PROTECT, related_name=\"links_in_record\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"record\", \"feature\", \"value\")\n\n\n# for annotating transforms with records\nclass TransformRecord(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    transform: Transform = ForeignKey(Transform, CASCADE, related_name=\"links_record\")\n    record: Record = ForeignKey(Record, PROTECT, related_name=\"links_transform\")\n    feature: Feature = ForeignKey(\n        Feature, PROTECT, null=True, related_name=\"links_transformrecord\"\n    )\n    created_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now()\n    )\n    created_by: User = ForeignKey(\n        \"lamindb.User\", PROTECT, default=current_user_id, related_name=\"+\"\n    )\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"transform\", \"record\", \"feature\")\n"
  },
  {
    "path": "lamindb/models/run.py",
    "content": "from __future__ import annotations\n\nimport os\nimport subprocess\nimport sys\nfrom typing import TYPE_CHECKING, overload\n\nfrom django.db import models\nfrom django.db.models import (\n    CASCADE,\n    PROTECT,\n    Q,\n)\nfrom lamin_utils import logger\nfrom lamindb_setup import _check_instance_setup\nfrom lamindb_setup import settings as setup_settings\n\nfrom lamindb.base.fields import (\n    BooleanField,\n    CharField,\n    DateTimeField,\n    ForeignKey,\n    TextField,\n)\nfrom lamindb.base.users import current_user_id\nfrom lamindb.base.utils import strict_classmethod\n\nfrom ..base.types import RUN_CODE_TO_STATUS\nfrom ..base.uids import base62_16\nfrom .can_curate import CanCurate\nfrom .query_set import BasicQuerySet, QuerySet\nfrom .sqlrecord import BaseSQLRecord, IsLink, SQLRecord\n\nif TYPE_CHECKING:\n    from datetime import datetime\n\n    from lamindb.base.types import RunStatus\n\n    from ._feature_manager import FeatureManager\n    from .artifact import Artifact\n    from .block import RunBlock\n    from .collection import Collection\n    from .feature import Feature, JsonValue\n    from .project import Project\n    from .query_manager import RelatedManager\n    from .record import Record\n    from .transform import Transform\n    from .ulabel import ULabel\n\n\n_TRACKING_READY: bool | None = None\n\n\ndef current_run() -> Run | None:\n    global _TRACKING_READY\n\n    if not _TRACKING_READY:\n        _TRACKING_READY = _check_instance_setup()\n    if _TRACKING_READY:\n        import lamindb\n\n        # also see get_run() in core._data\n        run = lamindb.core._functions.get_current_tracked_run()\n        if run is None:\n            run = lamindb.context.run\n        return run\n    else:\n        return None\n\n\nclass TracksRun(models.Model):\n    \"\"\"Base class tracking latest run, creating user, and `created_at` timestamp.\"\"\"\n\n    class Meta:\n        abstract = True\n\n    created_at: datetime = 
DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    \"\"\"Time of creation of record.\"\"\"\n    created_by: User = ForeignKey(\n        \"lamindb.User\",\n        PROTECT,\n        editable=False,\n        default=current_user_id,\n        related_name=\"+\",\n    )\n    \"\"\"Creator of record.\"\"\"\n    run: Run | None = ForeignKey(\n        \"lamindb.Run\", PROTECT, null=True, default=current_run, related_name=\"+\"\n    )\n    \"\"\"Run that created record.\"\"\"\n\n\nclass TracksUpdates(models.Model):\n    \"\"\"Base class tracking previous runs and `updated_at` timestamp.\"\"\"\n\n    class Meta:\n        abstract = True\n\n    updated_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    \"\"\"Time of last update to record.\"\"\"\n\n\nclass User(BaseSQLRecord, CanCurate):\n    \"\"\"Users.\n\n    Every :class:`~lamindb.models.SQLRecord` has a `created_by` field that links to the creating user.\n\n    This registry is automatically populated with user identities from LaminHub in case the user authenticates.\n\n    Examples:\n\n        Query a user by handle::\n\n            user = ln.User.get(handle=\"testuser1\")\n    \"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n\n    _name_field: str = \"handle\"\n\n    id: int = models.AutoField(primary_key=True)\n    \"\"\"Internal id, valid only in one DB instance.\"\"\"\n    uid: str = CharField(editable=False, unique=True, db_index=True, max_length=8)\n    \"\"\"Universal id, valid across DB instances.\"\"\"\n    handle: str = CharField(max_length=30, unique=True, db_index=True)\n    \"\"\"User handle, valid across DB instances (required).\"\"\"\n    name: str | None = CharField(max_length=150, db_index=True, null=True)\n    \"\"\"Full name (optional).\"\"\"  # has to match hub specification, where it's also optional\n    linked_in_records: RelatedManager[Record] = models.ManyToManyField(\n    
    \"Record\", through=\"RecordUser\", related_name=\"linked_users\"\n    )\n    \"\"\"This user is linked in these records as a value.\"\"\"\n    artifacts: RelatedManager[Artifact] = models.ManyToManyField(\n        \"Artifact\",\n        through=\"ArtifactUser\",\n        through_fields=(\"user\", \"artifact\"),\n        related_name=\"users\",\n    )\n    \"\"\"Artifacts annotated with this user.\"\"\"\n    created_artifacts: RelatedManager[Artifact]\n    \"\"\"Artifacts created by user.\"\"\"\n    created_transforms: RelatedManager[Transform]\n    \"\"\"Transforms created by user.\"\"\"\n    created_runs: RelatedManager[Run]\n    \"\"\"Runs created by user.\"\"\"\n    projects: RelatedManager[Project]\n    \"\"\"Projects this user is linked to (e.g. as member) ← :attr:`~lamindb.ProjectUser.project`.\"\"\"\n    created_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    \"\"\"Time of creation of object.\"\"\"\n    updated_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    \"\"\"Time of last update to object.\"\"\"\n\n    @overload\n    def __init__(\n        self,\n        uid: str,\n        handle: str,\n        name: str | None,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(\n        self,\n        *args,\n        **kwargs,\n    ):\n        super().__init__(*args, **kwargs)\n\n\nclass Run(SQLRecord, TracksUpdates):\n    \"\"\"Runs of transforms such as the executions of a script.\n\n    Args:\n        transform: :class:`~lamindb.Transform` A data transformation object.\n        name: `str | None = None` A name.\n        params: `dict | None = None` A dictionary of parameters.\n        reference: `str | None = None` For instance, an external ID or URL.\n        reference_type: `str | None = None` For instance, `redun_id`, `nextflow_id` or `url`.\n        
initiated_by_run: `Run | None = None` The `run` that triggers this `run`.\n\n    See Also:\n        :func:`~lamindb.track`\n            Globally track a script or notebook run.\n        :func:`~lamindb.step`\n            Track a function execution with this decorator.\n\n    Examples:\n\n        Create a run record::\n\n            ln.Transform(key=\"Cell Ranger\", version=\"7.2.0\", kind=\"pipeline\").save()\n            transform = ln.Transform.get(key=\"Cell Ranger\", version=\"7.2.0\")\n            run = ln.Run(transform)\n\n        Track a global run of a notebook or script::\n\n            ln.track()\n            ln.context.run  # global run object\n\n        You can pass parameters to `Run(transform, params=params)` or add them later::\n\n            run.params = {\n                \"learning_rate\": 0.01,\n                \"input_dir\": \"s3://my-bucket/mydataset\",\n                \"downsample\": True,\n                \"preprocess_params\": {\n                    \"normalization_type\": \"cool\",\n                    \"subset_highlyvariable\": True,\n                },\n            }\n            run.save()\n\n        In contrast to `.params`, features are indexed in the `Feature` registry and can reference relational categorical values.\n        If you want to link feature values, use::\n\n            run.features.set_values({\n                \"experiment\": \"My experiment 1\",\n            })\n\n        Guide: :ref:`track-run-parameters`\n    \"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n\n    _name_field: str = \"started_at\"\n\n    id: int = models.BigAutoField(primary_key=True)\n    \"\"\"Internal id, valid only in one DB instance.\"\"\"\n    # default uid was changed from base62_20 to base62_16 in 1.6.0\n    uid: str = CharField(\n        editable=False, unique=True, db_index=True, max_length=20, default=base62_16\n    )\n    \"\"\"Universal id, valid across DB instances.\"\"\"\n    name: str | None = CharField(max_length=150, 
null=True, db_index=True)\n    \"\"\"An optional name for this run.\"\"\"\n    description: str | None = TextField(null=True)\n    \"\"\"An optional description for this run.\"\"\"\n    transform: Transform = ForeignKey(\"Transform\", CASCADE, related_name=\"runs\")\n    \"\"\"The transform that is being run ← :attr:`~lamindb.Transform.runs`.\"\"\"\n    entrypoint: str | None = CharField(max_length=255, null=True, db_index=True)\n    \"\"\"The entrypoint of the transform.\n\n    This could be a function name or the entry point of a CLI or workflow manager.\n    \"\"\"\n    started_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    \"\"\"The time this run started.\"\"\"\n    finished_at: datetime | None = DateTimeField(db_index=True, null=True, default=None)\n    \"\"\"The time this run finished or aborted.\"\"\"\n    # we don't want to make below a OneToOne because there could be the same trivial report\n    # generated for many different runs\n    report: Artifact | None = ForeignKey(\n        \"Artifact\", PROTECT, null=True, related_name=\"_report_of\", default=None\n    )\n    \"\"\"The report of this run such as an `.html` or `.txt` file.\"\"\"\n    environment: Artifact | None = ForeignKey(\n        \"Artifact\", PROTECT, null=True, related_name=\"_environment_of\", default=None\n    )\n    \"\"\"The computational environment for this run.\n\n    For instance, `Dockerfile`, `docker image`, `requirements.txt`, `environment.yml`, etc.\n    \"\"\"\n    plan: Artifact | None = ForeignKey(\n        \"Artifact\", PROTECT, null=True, related_name=\"_plan_for_runs\", default=None\n    )\n    \"\"\"The (agent) plan for this run.\n\n    Also see: :attr:`~lamindb.Run.initiated_by_run`.\n    \"\"\"\n    input_records: RelatedManager[Record]\n    \"\"\"The records serving as input for this run ← :attr:`~lamindb.Record.input_of_runs`.\"\"\"\n    output_records: RelatedManager[Record]\n    \"\"\"The 
records created in this run ← :attr:`~lamindb.Record.run`.\"\"\"\n    input_artifacts: RelatedManager[Artifact]\n    \"\"\"The artifacts serving as input for this run ← :attr:`~lamindb.Artifact.input_of_runs`.\n    \"\"\"\n    output_artifacts: RelatedManager[Artifact]\n    \"\"\"The artifacts created in this run ← :attr:`~lamindb.Artifact.run`.\n\n    This does **not** include recreated artifacts, which are tracked via :attr:`~lamindb.Run.recreated_artifacts`.\n\n    If you want to query created + recreated artifacts, use :meth:`~lamindb.Run.query_output_artifacts` instead.\n    \"\"\"\n    recreated_artifacts: RelatedManager[Artifact]\n    \"\"\"The output artifacts that were recreated by this run ← :attr:`~lamindb.Artifact.recreating_runs`.\n\n    Artifacts are *recreated* if they trigger a hash lookup match for an existing artifact.\n    \"\"\"\n    input_collections: RelatedManager[Collection]\n    \"\"\"The collections serving as input for this run ← :attr:`~lamindb.Collection.input_of_runs`.\"\"\"\n    output_collections: RelatedManager[Collection]\n    \"\"\"The collections created in this run ← :attr:`~lamindb.Collection.run`.\"\"\"\n    recreated_collections: RelatedManager[Collection]\n    \"\"\"The output collections that were recreated by this run ← :attr:`~lamindb.Collection.recreating_runs`.\n\n    Collections are *recreated* if they trigger a hash lookup match for an existing collection.\n    \"\"\"\n    params: dict = models.JSONField(null=True)\n    \"\"\"Parameters (plain JSON values).\"\"\"\n    json_values: RelatedManager[JsonValue] = models.ManyToManyField(\n        \"JsonValue\", through=\"RunJsonValue\", related_name=\"runs\"\n    )\n    \"\"\"Feature-indexed JSON values ← :attr:`~lamindb.JsonValue.runs`.\"\"\"\n    reference: str | None = CharField(max_length=255, db_index=True, null=True)\n    \"\"\"A reference like a URL or an external ID such as from a workflow manager.\"\"\"\n    reference_type: str | None = CharField(max_length=25, 
db_index=True, null=True)\n    \"\"\"The type of the `reference` such as a workflow manager execution ID.\"\"\"\n    cli_args: str | None = CharField(max_length=1024, null=True, default=None)\n    \"\"\"CLI arguments if the run was invoked from the command line.\"\"\"\n    created_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    \"\"\"The time of creation of this run.\"\"\"\n    created_by: User = ForeignKey(\n        \"User\", CASCADE, default=current_user_id, related_name=\"created_runs\"\n    )\n    \"\"\"The creator of this run ← :attr:`~lamindb.User.created_runs`.\"\"\"\n    ulabels: RelatedManager[ULabel] = models.ManyToManyField(\n        \"ULabel\", through=\"RunULabel\", related_name=\"runs\"\n    )\n    \"\"\"The ulabels annotating this run ← :attr:`~lamindb.ULabel.runs`.\"\"\"\n    initiated_by_run: Run | None = ForeignKey(\n        \"Run\", CASCADE, null=True, related_name=\"initiated_runs\", default=None\n    )\n    \"\"\"The run that initiated this run ← :attr:`~lamindb.Run.initiated_runs`.\"\"\"\n    initiated_runs: RelatedManager[Run]\n    \"\"\"The runs that were initiated by this run.\"\"\"\n    projects: RelatedManager[Project]\n    \"\"\"The projects annotating this run ← :attr:`~lamindb.Project.runs`.\"\"\"\n    ablocks: RelatedManager[RunBlock]\n    \"\"\"Attached blocks ← :attr:`~lamindb.RunBlock.run`.\"\"\"\n    records: RelatedManager[Record]\n    \"\"\"The records annotating this run ← :attr:`~lamindb.Record.runs`.\"\"\"\n    linked_in_records: RelatedManager[Record] = models.ManyToManyField(\n        \"Record\", through=\"RecordRun\", related_name=\"linked_runs\"\n    )\n    \"\"\"This run is linked in these records as a value ← :attr:`~lamindb.Record.linked_runs`.\"\"\"\n    artifacts: RelatedManager[Artifact] = models.ManyToManyField(\n        \"Artifact\", through=\"ArtifactRun\", related_name=\"runs\"\n    )\n    \"\"\"The artifacts annotated by this run ← 
:attr:`~lamindb.Artifact.runs`.\"\"\"\n    linked_artifacts: RelatedManager[Artifact] = models.ManyToManyField(\n        \"Artifact\",\n        through=\"RunArtifact\",\n        related_name=\"linked_by_runs\",\n    )\n    \"\"\"The artifacts linked by this run through the run's features ← :attr:`~lamindb.RunArtifact.artifact`.\"\"\"\n    _is_consecutive: bool | None = BooleanField(null=True)\n    \"\"\"Indicates whether code was consecutively executed. Is relevant for notebooks.\"\"\"\n    _status_code: int = models.SmallIntegerField(\n        default=-3,\n        db_default=-3,\n        db_index=True,\n    )\n    \"\"\"Status code of the run. See the status property for mapping to string.\"\"\"\n\n    @overload\n    def __init__(\n        self,\n        transform: Transform,\n        name: str | None = None,\n        description: str | None = None,\n        entrypoint: str | None = None,\n        params: dict | None = None,\n        reference: str | None = None,\n        reference_type: str | None = None,\n        initiated_by_run: Run | None = None,\n        plan: Artifact | None = None,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(\n        self,\n        *args,\n        **kwargs,\n    ):\n        if len(args) == len(self._meta.concrete_fields):\n            super().__init__(*args, **kwargs)\n            return None\n        # now we proceed with the user-facing constructor\n        if len(args) > 1:\n            raise ValueError(\"Only one non-keyword arg allowed: transform\")\n        transform: Transform = None\n        if \"transform\" in kwargs or len(args) == 1:\n            transform = kwargs.pop(\"transform\") if len(args) == 0 else args[0]\n        name: str | None = kwargs.pop(\"name\", None)\n        description: str | None = kwargs.pop(\"description\", None)\n        entrypoint: str | None = kwargs.pop(\"entrypoint\", None)\n        params: dict | None = kwargs.pop(\"params\", 
None)\n        reference: str | None = kwargs.pop(\"reference\", None)\n        reference_type: str | None = kwargs.pop(\"reference_type\", None)\n        initiated_by_run: Run | None = kwargs.pop(\"initiated_by_run\", None)\n        report: Artifact | None = kwargs.pop(\"report\", None)\n        plan: Artifact | None = kwargs.pop(\"plan\", None)\n        if transform is None:\n            raise TypeError(\"Pass transform parameter\")\n        if transform._state.adding:\n            raise ValueError(\"Please save transform record before creating a run\")\n        if not len(kwargs) == 0:\n            raise ValueError(\n                f\"Only transform, name, description, params, reference, reference_type, initiated_by_run, plan can be passed, but you passed: {kwargs}\"\n            )\n        super().__init__(  # type: ignore\n            transform=transform,\n            name=name,\n            description=description,\n            entrypoint=entrypoint,\n            params=params,\n            reference=reference,\n            reference_type=reference_type,\n            initiated_by_run=initiated_by_run,\n            report=report,\n            plan=plan,\n        )\n\n    @property\n    def status(self) -> RunStatus:\n        \"\"\"Run status.\n\n        Get the status of the run:\n\n        ===========  =====  ===========================\n        status       code   description\n        ===========  =====  ===========================\n        `scheduled`  -3     The run is scheduled.\n        `restarted`  -2     The run was restarted.\n        `started`    -1     The run has started.\n        `completed`  0      The run completed successfully.\n        `errored`    1      The run ended with an error.\n        `aborted`    2      The run was aborted.\n        ===========  =====  ===========================\n\n        The database stores the run status as an integer code in field `_status_code`.\n\n        Example:\n\n            See the status of a run::\n\n   
             run.status\n                #> 'completed'\n\n            Query by status::\n\n                ln.Run.filter(status=\"completed\").to_dataframe()\n\n        \"\"\"\n        return RUN_CODE_TO_STATUS[self._status_code]\n\n    @property\n    def features(self) -> FeatureManager:\n        \"\"\"Manage annotations with features.\n\n        For examples, see :class:`~lamindb.Run` or :class:`~lamindb.models.FeatureManager`.\n        \"\"\"\n        from ._feature_manager import FeatureManager\n\n        return FeatureManager(self)\n\n    def query_output_artifacts(\n        self, include_recreated: bool = True\n    ) -> QuerySet[Artifact]:\n        \"\"\"Query output artifacts including recreated ones.\n\n        This runs the following query under the hood::\n\n            ln.Artifact.filter(ln.Q(run=self) | ln.Q(recreating_runs=self)).distinct()\n\n        Args:\n            include_recreated: If `True`, return both originally created\n                and recreated artifacts. If `False`, return only originally\n                created artifacts.\n\n        Returns:\n            A queryset of :class:`~lamindb.Artifact` objects.\n\n        See Also:\n            :attr:`~lamindb.Run.output_artifacts`\n                `QuerySet` of originally created artifacts.\n            :attr:`~lamindb.Run.recreated_artifacts`\n                `QuerySet` of recreated artifacts.\n        \"\"\"\n        if not include_recreated:\n            return self.output_artifacts.all()\n        else:\n            return self.output_artifacts.model.filter(\n                Q(run=self) | Q(recreating_runs=self)\n            ).distinct()\n\n    @strict_classmethod\n    def filter(\n        cls,\n        *queries,\n        **expressions,\n    ) -> QuerySet:\n        \"\"\"Query a set of runs.\n\n        Args:\n            *queries: `Q` expressions.\n            **expressions: Params, fields, and values passed via the Django query syntax.\n\n        See Also:\n            - Guide: 
:doc:`docs:registries`\n\n        Examples:\n\n            Query by fields::\n\n                ln.Run.filter(key=\"examples/my_file.parquet\")\n\n            Query by params::\n\n                ln.Run.filter(hyperparam_x=100)\n        \"\"\"\n        # from Registry metaclass\n        return type(cls).filter(cls, *queries, **expressions)\n\n\ndef _permanent_delete_runs(runs: Run | QuerySet) -> None:\n    \"\"\"Execute bulk DELETE on runs and spawn artifact cleanup. Used by QuerySet and single-run paths.\"\"\"\n    if isinstance(runs, Run):\n        db = runs._state.db or \"default\"\n        first_run_uid = runs.uid\n        artifact_ids = []\n        if runs.environment_id:\n            artifact_ids.append(runs.environment_id)\n        if runs.report_id:\n            artifact_ids.append(runs.report_id)\n        super(BaseSQLRecord, runs).delete()\n    else:\n        db = runs.db or \"default\"\n        rows = list(runs.values_list(\"uid\", \"report_id\", \"environment_id\"))\n        if rows:\n            first_run_uid = rows[0][0]\n        else:\n            return\n        artifact_ids = list({aid for r in rows for aid in r[1:3] if aid is not None})\n        super(BasicQuerySet, runs).delete()\n    if artifact_ids:\n        ids_str = \",\".join(map(str, artifact_ids))\n        instance = db if db not in (None, \"default\") else setup_settings.instance.slug\n        # spawn background subprocess to delete orphaned report/env artifacts\n        cmd: list[str] = [\n            sys.executable,\n            \"-m\",\n            \"lamindb.models._run_cleanup\",\n            \"--instance\",\n            instance,\n            \"--ids\",\n            ids_str,\n            \"--run-uid\",\n            first_run_uid,\n        ]\n        proc = subprocess.Popen(\n            cmd,\n            start_new_session=True,\n            stdout=subprocess.DEVNULL,\n            stderr=subprocess.DEVNULL,\n            env=os.environ,\n        )\n        log_path = 
setup_settings.cache_dir / f\"run_cleanup_logs_{first_run_uid}.txt\"\n        logger.important(\n            f\"spawned run cleanup subprocess (pid={proc.pid}): {log_path}\\n  {' '.join(cmd)}\"\n        )\n\n\nclass RunJsonValue(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    run: Run = ForeignKey(Run, CASCADE, related_name=\"links_jsonvalue\")\n    # we follow the lower() case convention rather than snake case for link models\n    jsonvalue: JsonValue = ForeignKey(\"JsonValue\", PROTECT, related_name=\"links_run\")\n    created_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    \"\"\"Time of creation of record.\"\"\"\n    created_by: User = ForeignKey(\n        \"lamindb.User\", PROTECT, default=current_user_id, related_name=\"+\"\n    )\n    \"\"\"Creator of record.\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"run\", \"jsonvalue\")\n\n\n# for storing artifact-like values in runs\n# compare RunRecord as opposed to RecordRun\nclass RunArtifact(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    run: Run = ForeignKey(Run, CASCADE, related_name=\"values_artifact\")\n    artifact: Artifact = ForeignKey(\"Artifact\", PROTECT, related_name=\"links_in_run\")\n    feature: Feature | None = ForeignKey(\n        \"Feature\", PROTECT, null=True, related_name=\"links_runartifact\", default=None\n    )\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"run\", \"artifact\", \"feature\")\n"
  },
  {
    "path": "lamindb/models/save.py",
    "content": "# ruff: noqa: TC004\nfrom __future__ import annotations\n\nimport os\nimport shutil\nimport traceback\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import TYPE_CHECKING\n\nfrom django.db import IntegrityError, transaction\nfrom django.utils.functional import partition\nfrom lamin_utils import logger\nfrom lamindb_setup.core.upath import LocalPathClasses, UPath\n\nfrom ..core._settings import settings\nfrom .sqlrecord import (\n    UNIQUE_FIELD_NAMES,\n    SQLRecord,\n    parse_violated_field_from_error_message,\n)\n\nif TYPE_CHECKING:\n    from collections.abc import Iterable\n\n    from .artifact import Artifact\n\n\ndef save(\n    records: Iterable[SQLRecord],\n    ignore_conflicts: bool | None = False,\n    batch_size: int = 10000,\n) -> None:\n    \"\"\"Bulk save records.\n\n    Note:\n\n        This is a much faster than saving records using ``record.save()``.\n\n    Warning:\n\n        Bulk saving neither automatically creates related records nor updates\n        existing records! 
Use ``record.save()`` for these use cases.\n\n    Args:\n        records: Multiple :class:`~lamindb.models.SQLRecord` objects.\n        ignore_conflicts: If `True`, do not error if some records violate a unique or another constraint.\n            However, it won't inplace update the id fields of records.\n            If you need records with ids, you need to query them from the database.\n        batch_size: Number of records to process in each batch.\n            Large batch sizes can improve performance but may lead to memory issues.\n\n    Examples:\n\n        Save a list of records:\n\n        >>> labels = [ln.ULabel(f\"Label {i}\") for i in range(10)]\n        >>> ln.save(labels)\n\n        For a single record, use ``record.save()``:\n\n        >>> transform = ln.Transform(key=\"My pipeline\")\n        >>> transform.save()\n\n        Update a single existing record:\n\n        >>> transform = ln.Transform.get(\"0Cb86EZj\")\n        >>> transform.description = \"New description\"\n        >>> transform.save()\n\n    \"\"\"\n    from .artifact import Artifact\n\n    if isinstance(records, SQLRecord):\n        raise ValueError(\"Please use record.save() if saving a single record.\")\n\n    # previously, this was all set based,\n    # but models without primary keys aren't hashable\n    # we distinguish between artifacts and non-artifacts\n    # for artifacts, we want to bulk-upload rather than upload one-by-one\n    non_artifacts, artifacts = partition(lambda r: isinstance(r, Artifact), records)\n    if non_artifacts:\n        non_artifacts_old, non_artifacts_new = partition(\n            lambda r: r._state.adding or r.pk is None, non_artifacts\n        )\n        bulk_create(\n            non_artifacts_new, ignore_conflicts=ignore_conflicts, batch_size=batch_size\n        )\n        if non_artifacts_old:\n            bulk_update(non_artifacts_old, batch_size=batch_size)\n        non_artifacts_with_parents = [\n            r for r in non_artifacts_new if 
hasattr(r, \"_parents\")\n        ]\n        if len(non_artifacts_with_parents) > 0:\n            # this can only happen within bionty right now!!\n            # we might extend to core lamindb later\n            from bionty.core import add_ontology\n\n            add_ontology(non_artifacts_with_parents)\n        records_with_lazy_features = [\n            record\n            for record in non_artifacts\n            if record.__class__.__name__ == \"Record\" and hasattr(record, \"_features\")\n        ]\n        if records_with_lazy_features:\n            from ._feature_manager import bulk_set_features_in_records\n\n            bulk_set_features_in_records(records_with_lazy_features)\n\n    if artifacts:\n        with transaction.atomic():\n            for record in artifacts:\n                # will switch to True after the successful upload / saving\n                if getattr(record, \"_local_filepath\", None) is not None and getattr(\n                    record, \"_to_store\", False\n                ):\n                    record._storage_ongoing = True\n                record._save_skip_storage()\n        using_key = settings._using_key\n        store_artifacts(artifacts, using_key=using_key)\n\n    # this function returns None as potentially 10k records might be saved\n    # refreshing all of them from the DB would mean a severe performance penalty\n    # 2nd reason: consistency with Django Model.save(), which also returns None\n    return None\n\n\ndef bulk_create(\n    records: Iterable[SQLRecord],\n    ignore_conflicts: bool | None = False,\n    batch_size: int = 10000,\n):\n    \"\"\"Create records in batches for safety and performance.\n\n    Args:\n        records: Iterable of SQLRecord objects to create\n        ignore_conflicts: Whether to ignore conflicts during creation\n        batch_size: Number of records to process in each batch.\n    \"\"\"\n    records_by_orm = defaultdict(list)\n    for record in records:\n        
records_by_orm[record.__class__].append(record)\n\n    for registry, records_list in records_by_orm.items():\n        total_records = len(records_list)\n        model_name = registry.__name__\n        if total_records > batch_size:\n            logger.important(\n                f\"starting creation of {total_records} {model_name} records in batches of {batch_size}\"\n            )\n\n        # Process records in batches\n        for i in range(0, len(records_list), batch_size):\n            batch = records_list[i : i + batch_size]\n            batch_num = (i // batch_size) + 1\n            total_batches = (total_records + batch_size - 1) // batch_size\n\n            if total_records > batch_size:\n                logger.info(\n                    f\"processing batch {batch_num}/{total_batches} for {model_name}: {len(batch)} records\"\n                )\n            try:\n                registry.objects.bulk_create(batch, ignore_conflicts=ignore_conflicts)\n            # handle unique constraint violations due to non-default branches\n            except IntegrityError as e:\n                error_msg = str(e)\n                if any(field in error_msg for field in UNIQUE_FIELD_NAMES) and (\n                    \"UNIQUE constraint failed\" in error_msg\n                    or \"duplicate key value violates unique constraint\" in error_msg\n                ):\n                    unique_fields = parse_violated_field_from_error_message(error_msg)\n\n                    # Build tuples of unique field values for each record\n                    unique_field_values = [\n                        tuple(getattr(r, field) for field in unique_fields)\n                        for r in batch\n                    ]\n\n                    # Build Q objects for multi-field lookup\n                    from django.db.models import Q\n\n                    q_objects = Q()\n                    for values in unique_field_values:\n                        field_kwargs = {\n               
             unique_fields[i]: values[i]\n                            for i in range(len(unique_fields))\n                        }\n                        q_objects |= Q(**field_kwargs)\n\n                    # Query against non-default branches\n                    pre_existing_records_not_main_branch = registry.objects.filter(\n                        q_objects\n                    ).exclude(branch_id=1)\n\n                    # Get the unique field value tuples that already exist\n                    pre_existing_value_tuples = {\n                        tuple(getattr(rec, field) for field in unique_fields)\n                        for rec in pre_existing_records_not_main_branch\n                    }\n\n                    # Records that can be saved normally (not in non-default branches)\n                    records_main_branch = [\n                        r\n                        for r in batch\n                        if tuple(getattr(r, field) for field in unique_fields)\n                        not in pre_existing_value_tuples\n                    ]\n                    save(records_main_branch)\n\n                    # Now move the pre-existing records to the main branch\n                    if pre_existing_value_tuples:\n                        unique_fields_str = \", \".join(unique_fields)\n                        logger.warning(\n                            f\"some {model_name} records with the same ({unique_fields_str}) already exist in non-default branches - moving them to the default branch\"\n                        )\n                        pre_existing_records_to_move = [\n                            r\n                            for r in batch\n                            if tuple(getattr(r, field) for field in unique_fields)\n                            in pre_existing_value_tuples\n                        ]\n                        for record in pre_existing_records_to_move:\n                            record.save()\n                
else:\n                    raise e\n\n\ndef bulk_update(\n    records: Iterable[SQLRecord],\n    ignore_conflicts: bool | None = False,\n    batch_size: int = 10000,\n):\n    \"\"\"Update records in batches for safety and performance.\n\n    Args:\n        records: Iterable of SQLRecord objects to update\n        ignore_conflicts: Whether to ignore conflicts during update (currently unused but kept for consistency)\n        batch_size: Number of records to process in each batch. If None, processes all at once.\n    \"\"\"\n    records_by_orm = defaultdict(list)\n    for record in records:\n        records_by_orm[record.__class__].append(record)\n\n    for registry, records_list in records_by_orm.items():\n        total_records = len(records_list)\n        model_name = registry.__name__\n        if total_records > batch_size:\n            logger.warning(\n                f\"starting update for {total_records} {model_name} records in batches of {batch_size}\"\n            )\n\n        field_names = [\n            field.name\n            for field in registry._meta.fields\n            if (field.name != \"created_at\" and field.name != \"id\")\n        ]\n\n        # Process records in batches\n        for i in range(0, len(records_list), batch_size):\n            batch = records_list[i : i + batch_size]\n            batch_num = (i // batch_size) + 1\n            total_batches = (total_records + batch_size - 1) // batch_size\n\n            if total_records > batch_size:\n                logger.info(\n                    f\"processing batch {batch_num}/{total_batches} for {model_name}: {len(batch)} records\"\n                )\n            registry.objects.bulk_update(batch, field_names)\n\n\n# This is also used within Artifact.save()\ndef check_and_attempt_upload(\n    artifact: Artifact,\n    using_key: str | None = None,\n    access_token: str | None = None,\n    print_progress: bool = True,\n    **kwargs,\n) -> Exception | None:\n    # kwargs are propagated to 
.upload_from in the end\n    # if Artifact object is either newly instantiated or replace() was called on\n    # a local env it will have a _local_filepath and needs to be uploaded\n    if getattr(artifact, \"_local_filepath\", None) is not None:\n        try:\n            storage_path, cache_path = upload_artifact(\n                artifact,\n                using_key,\n                access_token=access_token,\n                print_progress=print_progress,\n                **kwargs,\n            )\n        except Exception as exception:\n            logger.warning(f\"could not upload artifact: {artifact}\")\n            # clear dangling storages if we were actually uploading or saving\n            if getattr(artifact, \"_to_store\", False):\n                # avoid root-level import of core.storage module\n                from ..core.storage import paths\n\n                artifact._clear_storagekey = paths.auto_storage_key_from_artifact(\n                    artifact\n                )  # type: ignore\n            return exception\n        # copies (if on-disk) or moves the temporary file (if in-memory) to the cache\n        if os.getenv(\"LAMINDB_MULTI_INSTANCE\") is None:\n            # this happens only after the actual upload was performed\n            # we avoid failing here in case any problems happen in copy_or_move_to_cache\n            # because the cache copying or cleanup is not absolutely necessary\n            try:\n                copy_or_move_to_cache(artifact, storage_path, cache_path)\n            except Exception as e:\n                if not str(e).startswith(\n                    \"[WinError 32] The process cannot access the file \"\n                    \"because it is being used by another process\"\n                ):\n                    # ignore WinError 32 error, this just means that the file is still open on save\n                    # it is saved at this point, so not a big deal if copy or move to cache fails\n                    # 
this mostly happens for run logs\n                    # just ignore without a warning\n                    logger.warning(f\"A problem with cache on saving: {e}\")\n        # after successful upload, we should remove the attribute so that another\n        # call to save won't upload again, the user should call replace() then\n        del artifact._local_filepath\n    # returning None means proceed (either success or no action needed)\n    return None\n\n\ndef copy_or_move_to_cache(\n    artifact: Artifact, storage_path: UPath, cache_path: UPath | None\n):\n    local_path = artifact._local_filepath\n\n    # in-memory cases\n    if local_path is None or not local_path.exists():\n        return None\n\n    local_path = local_path.resolve()\n    is_dir = local_path.is_dir()\n    cache_dir = settings.cache_dir\n\n    # just delete from the cache dir if storage_path is local\n    if cache_path is None:\n        if (\n            local_path.as_posix() != storage_path.as_posix()\n            and cache_dir in local_path.parents\n        ):\n            if is_dir:\n                shutil.rmtree(local_path)\n            else:\n                local_path.unlink()\n        return None\n    # non-local storage_path further\n    if local_path != cache_path:\n        if cache_path.exists():\n            logger.important_hint(\n                f\"replacing the existing cache path {cache_path.as_posix()}\"\n            )\n            if cache_path.is_dir():\n                shutil.rmtree(cache_path)\n            else:\n                cache_path.unlink()\n        else:\n            cache_path.parent.mkdir(parents=True, exist_ok=True)\n        if cache_dir in local_path.parents:\n            local_path.replace(cache_path)\n        else:\n            if is_dir:\n                shutil.copytree(local_path, cache_path)\n            else:\n                shutil.copy(local_path, cache_path)\n    # make sure that the cached version is older than the cloud one\n    mts = 
datetime.now().timestamp() + 1.0\n    if is_dir:\n        files = (file for file in cache_path.rglob(\"*\") if file.is_file())\n        for file in files:\n            os.utime(file, times=(mts, mts))\n    else:\n        os.utime(cache_path, times=(mts, mts))\n\n\n# This is also used within Artifact.save()\ndef check_and_attempt_clearing(\n    artifact: Artifact,\n    raise_file_not_found_error: bool = True,\n    using_key: str | None = None,\n) -> Exception | None:\n    # this is a clean-up operation after replace() was called\n    # or if there was an exception during upload\n    if hasattr(artifact, \"_clear_storagekey\"):\n        try:\n            if artifact._clear_storagekey is not None:  # type: ignore\n                # avoid root-level import of core.storage module\n                from ..core.storage import paths\n\n                delete_msg = paths.delete_storage_using_key(\n                    artifact,\n                    artifact._clear_storagekey,  # type: ignore\n                    raise_file_not_found_error=raise_file_not_found_error,\n                    using_key=using_key,\n                )\n                if delete_msg != \"did-not-delete\":\n                    logger.success(\n                        f\"deleted stale object at storage key {artifact._clear_storagekey}\"  # type: ignore\n                    )\n                artifact._clear_storagekey = None  # type: ignore\n        except Exception as exception:\n            return exception\n    # returning None means proceed (either success or no action needed)\n    return None\n\n\ndef store_artifacts(\n    artifacts: Iterable[Artifact], using_key: str | None = None\n) -> None:\n    \"\"\"Upload artifacts in a list of database-committed artifacts to storage.\n\n    If any upload fails, subsequent artifacts are cleaned up from the DB.\n    \"\"\"\n    from .artifact import Artifact\n\n    exception: Exception | None = None\n    # because uploads might fail, we need to maintain a new 
list of the succeeded uploads\n    stored_artifacts = []\n\n    # upload new local artifacts\n    for artifact in artifacts:\n        # failure here sets ._clear_storagekey\n        # for cleanup below\n        exception = check_and_attempt_upload(artifact, using_key)\n        if exception is not None:\n            break\n\n        stored_artifacts += [artifact]\n        # update to show successful saving\n        # only update if _storage_ongoing was set to True before\n        # this should be a single transaction for the updates of all the artifacts\n        # but then it would just abort all artifacts, even those successfully stored before\n        # TODO: there should also be some kind of exception handling here\n        # but this requires refactoring\n        if artifact._storage_ongoing:\n            artifact._storage_ongoing = False\n            # each .save() is a separate transaction below\n            super(Artifact, artifact).save()\n        # if check_and_attempt_upload was successful\n        # then this can have only ._clear_storagekey from .replace\n        exception = check_and_attempt_clearing(\n            artifact, raise_file_not_found_error=True, using_key=using_key\n        )\n        if exception is not None:\n            logger.warning(f\"clean up of {artifact._clear_storagekey} failed\")  # type: ignore\n            break\n\n    if exception is not None:\n        # clean up metadata for artifacts not uploaded to storage\n        with transaction.atomic():\n            for artifact in artifacts:\n                if artifact not in stored_artifacts:\n                    artifact._delete_skip_storage()\n                    # clean up storage after failure in check_and_attempt_upload\n                    exception_clear = check_and_attempt_clearing(\n                        artifact, raise_file_not_found_error=False, using_key=using_key\n                    )\n                    if exception_clear is not None:\n                        
logger.warning(\n                            f\"clean up of {artifact._clear_storagekey} after the upload error failed\"  # type: ignore\n                        )\n        error_message = prepare_error_message(artifacts, stored_artifacts, exception)\n        # this is bad because we're losing the original traceback\n        # needs to be refactored - also, the orginal error should be raised\n        raise RuntimeError(error_message)\n    return None\n\n\ndef prepare_error_message(records, stored_artifacts, exception) -> str:\n    if len(stored_artifacts) == 0:\n        error_message = (\n            \"No entries were uploaded or committed\"\n            \" to the database. See error message:\\n\\n\"\n        )\n    else:\n        error_message = (\n            \"The following entries have been\"\n            \" successfully uploaded and committed to the database:\\n\"\n        )\n        for record in stored_artifacts:\n            error_message += (\n                f\"- {', '.join(record.__repr__().split(', ')[:3]) + ', ...)'}\\n\"\n            )\n        error_message += \"\\nSee error message:\\n\\n\"\n    error_message += f\"{str(exception)}\\n\\n{traceback.format_exc()}\"\n    return error_message\n\n\ndef upload_artifact(\n    artifact,\n    using_key: str | None = None,\n    access_token: str | None = None,\n    print_progress: bool = True,\n    **kwargs,\n) -> tuple[UPath, UPath | None]:\n    \"\"\"Store and add file and its linked entries.\"\"\"\n    # kwargs are propagated to .upload_from in the end\n    # can't currently use  filepath_from_artifact here because it resolves to ._local_filepath\n    # avoid root-level import of core.storage module\n    from ..core.storage import paths\n\n    storage_key = paths.auto_storage_key_from_artifact(artifact)\n    storage_path, storage_settings = paths.attempt_accessing_path(\n        artifact, storage_key, using_key=using_key, access_token=access_token\n    )\n    if getattr(artifact, \"_to_store\", False):\n   
     logger.save(f\"storing artifact '{artifact.uid}' at '{storage_path}'\")\n        paths.store_file_or_folder(\n            artifact._local_filepath,\n            storage_path,\n            print_progress=print_progress,\n            **kwargs,\n        )\n\n    if isinstance(storage_path, LocalPathClasses):\n        cache_path = None\n    else:\n        cache_key = paths._cache_key_from_artifact_storage(artifact, storage_settings)\n        cache_path = storage_settings.cloud_to_local_no_update(\n            storage_path, cache_key=cache_key\n        )\n\n    return storage_path, cache_path\n"
  },
  {
    "path": "lamindb/models/schema.py",
    "content": "from __future__ import annotations\n\nimport warnings\nfrom typing import TYPE_CHECKING, Any, Type, overload\n\nimport numpy as np\nfrom django.db import models\nfrom django.db.models import CASCADE, PROTECT, ManyToManyField, Q\nfrom lamin_utils import logger\nfrom lamindb_setup.core import deprecated\nfrom lamindb_setup.core.hashing import HASH_LENGTH, hash_string\n\nfrom lamindb.base.fields import (\n    BooleanField,\n    CharField,\n    ForeignKey,\n    IntegerField,\n    TextField,\n)\nfrom lamindb.base.types import FieldAttr, ListLike\nfrom lamindb.base.uids import base62_16\nfrom lamindb.base.utils import class_and_instance_method\nfrom lamindb.errors import FieldValidationError, InvalidArgument\nfrom lamindb.models.feature import parse_cat_dtype\n\nfrom ..errors import ValidationError\nfrom ._describe import describe_schema, format_rich_tree\nfrom ._relations import (\n    dict_related_model_to_related_name,\n    get_related_name,\n)\nfrom .can_curate import CanCurate\nfrom .feature import (\n    Feature,\n    serialize_dtype,\n    serialize_pandas_dtype,\n)\nfrom .has_parents import _query_relatives\nfrom .query_set import QuerySet, SQLRecordList\nfrom .run import TracksRun, TracksUpdates\nfrom .sqlrecord import (\n    BaseSQLRecord,\n    HasType,\n    IsLink,\n    Registry,\n    SQLRecord,\n    _get_record_kwargs,\n    init_self_from_db,\n    update_attributes,\n)\n\nif TYPE_CHECKING:\n    import pandas as pd\n    from django.db.models.query_utils import DeferredAttribute\n\n    from .artifact import Artifact\n    from .block import SchemaBlock\n    from .project import Project\n    from .query_manager import RelatedManager\n    from .record import Record\n\n\nNUMBER_TYPE = \"num\"\nDICT_KEYS_TYPE = type({}.keys())  # type: ignore\n\n\ndef validate_features(features: list[SQLRecord]) -> SQLRecord:\n    \"\"\"Validate and return feature type.\"\"\"\n    try:\n        if len(features) == 0:\n            raise ValueError(\"Provide list of 
features with at least one element\")\n    except TypeError:\n        raise ValueError(\n            \"Please pass a ListLike of features, not a single feature\"\n        ) from None\n    if not hasattr(features, \"__getitem__\"):\n        raise TypeError(\"features has to be list-like\")\n    if not isinstance(features[0], SQLRecord):\n        raise TypeError(\n            \"features has to store feature records! use .from_values() otherwise\"\n        )\n    feature_types = {feature.__class__ for feature in features}\n    if len(feature_types) > 1:\n        raise TypeError(\"schema can only contain a single type\")\n    for feature in features:\n        if feature._state.adding:\n            raise ValueError(\"Can only construct feature sets from validated features\")\n    return next(iter(feature_types))  # return value in set of cardinality 1\n\n\ndef get_features_config(\n    features: list[SQLRecord] | tuple[SQLRecord, dict],\n) -> tuple[list[SQLRecord], list[tuple[SQLRecord, dict]]]:\n    \"\"\"Get features and their config from the return of feature.with_config().\"\"\"\n    features_list = []\n    configs = []\n    try:\n        for feature in features:\n            if isinstance(feature, tuple):\n                features_list.append(feature[0])\n                configs.append(feature)  # store the tuple in configs\n            else:\n                features_list.append(feature)\n        return features_list, configs  # type: ignore\n    except TypeError:\n        return features, configs  # type: ignore\n\n\nclass SchemaOptionals:\n    \"\"\"Manage and access optional features in a schema.\"\"\"\n\n    def __init__(self, schema) -> None:\n        self.schema = schema\n\n    def get_uids(self) -> list[str]:\n        \"\"\"Get the uids of the optional features.\n\n        Does **not** need an additional query to the database, while `get()` does.\n        \"\"\"\n        if (\n            self.schema._aux is not None\n            and \"af\" in 
self.schema._aux\n            and \"1\" in self.schema._aux[\"af\"]\n        ):\n            return self.schema._aux[\"af\"][\"1\"]\n        else:\n            return []\n\n    def get(self) -> QuerySet:\n        \"\"\"Get the optional features.\"\"\"\n        uids = self.get_uids()\n        if uids:\n            return Feature.objects.filter(uid__in=uids).order_by(\"links_schema__id\")\n        else:\n            return Feature.objects.none()  # empty QuerySet\n\n    def set(self, features: list[Feature]) -> None:\n        \"\"\"Set the optional features (overwrites whichever features are currently optional).\"\"\"\n        if not isinstance(features, list) or not all(\n            isinstance(f, Feature) for f in features\n        ):\n            raise TypeError(\"features must be a list of Feature records!\")\n        self.schema._aux = self.schema._aux or {}\n        if len(features) > 0:\n            self.schema._aux.setdefault(\"af\", {})[\"1\"] = [f.uid for f in features]\n\n    def remove(self, features: Feature | list[Feature]) -> None:\n        \"\"\"Make one or multiple features required by removing them from the set of optional features.\"\"\"\n        if not isinstance(features, list):\n            features = [features]\n        if not all(isinstance(f, Feature) for f in features):\n            raise TypeError(\"features must be a list of Feature records!\")\n        if len(features) > 0:\n            self.schema._aux = self.schema._aux or {}\n            if \"1\" in self.schema._aux.get(\"af\", {}):\n                for feature in features:\n                    self.schema._aux[\"af\"][\"1\"].remove(feature.uid)\n\n    def add(self, features: Feature | list[Feature]) -> None:\n        \"\"\"Make one or multiple features optional by adding them to the set of optional features.\"\"\"\n        self.schema._aux = self.schema._aux or {}\n        if not isinstance(features, list):\n            features = [features]\n        if not all(isinstance(f, Feature) 
for f in features):\n            raise TypeError(\"features must be a list of Feature records!\")\n        if len(features) > 0:\n            if \"1\" not in self.schema._aux.setdefault(\"af\", {}):\n                self.set(features)\n            else:\n                self.schema._aux.setdefault(\"af\", {})[\"1\"].extend(\n                    [f.uid for f in features]\n                )\n\n\nKNOWN_SCHEMAS = {  # by hash\n    \"kMi7B_N88uu-YnbTLDU-DA\": \"0000000000000000\",  # valid_features\n    \"1gocc_TJ1RU2bMwDRK-WUA\": \"0000000000000001\",  # valid_ensembl_gene_ids\n    \"UR_ozz2VI2sY8ckXop2RAg\": \"0000000000000002\",  # anndata_ensembl_gene_ids_and_valid_features_in_obs (itype='Composite')\n    \"aqGWHvyY49W_PHELUMiBMw\": \"0000000000000002\",  # anndata_ensembl_gene_ids_and_valid_features_in_obs (itype=None)\n}\n\n\nclass Schema(SQLRecord, HasType, CanCurate, TracksRun, TracksUpdates):\n    \"\"\"Schemas of datasets such as column sets of dataframes.\n\n    .. note::\n\n        To create a schema, at least one of the following parameters must be passed:\n\n        - `features` - a list of `Feature` objects\n        - `itype` - the identifier type, e.g., `Feature` or `bt.Gene.ensembl_gene_id`\n        - `slots` - a dictionary mapping slots to :class:`~lamindb.Schema` objects, e.g., for an `AnnData`, `{\"obs\": Schema(...), \"var.T\": Schema(...)}`\n        - `is_type=True` - a *schema type* to group schemas, e.g., \"ProteinPanel\"\n\n    Args:\n        features: `list[SQLRecord] | list[tuple[Feature, dict]] | None = None` Feature\n            records, e.g., `[Feature(...), Feature(...)]` or features with their config, e.g., `[Feature(...).with_config(optional=True)]`.\n        slots: `dict[str, Schema] | None = None` A dictionary mapping slot names to :class:`~lamindb.Schema` objects to create a _composite_ schema.\n        name: `str | None = None` Name of the schema.\n        description: `str | None = None` Description of the schema.\n        itype: 
`str | None = None` Feature identifier type to validate against, e.g., `ln.Feature` or `bt.Gene.ensembl_gene_id`.\n            Is automatically set to the type of the passed `features`.\n        type: `Schema | None = None` Define schema types like `ln.Schema(name=\"ProteinPanel\", is_type=True)`.\n        is_type: `bool = False` Whether the schema is a type.\n        index: `Feature | None = None` A `Feature` record to validate an index of a `DataFrame` and therefore also, e.g., `AnnData` obs and var indices.\n        flexible: `bool | None = None` Whether to include any feature of the same `itype` during validation & annotation.\n            If `features` is passed, defaults to `False` so that, e.g., additional columns of a `DataFrame` encountered during validation are disregarded.\n            If `features` is not passed, defaults to `True`.\n        otype: `str | None = None` An object type to define the structure of a composite schema, e.g., `\"DataFrame\"`, `\"AnnData\"`.\n        dtype: `str | None = None` A `dtype` to assume for all features in the schema (e.g., \"num\", float, int).\n            Defaults to `None` if `itype` is `Feature`. Otherwise to `\"num\"`, e.g., if `itype` is `bt.Gene.ensembl_gene_id`.\n        minimal_set: `bool = True` Whether all passed features are required by default.\n            See :attr:`~lamindb.Schema.optionals` for more-fine-grained control.\n        maximal_set: `bool = False` Whether additional features are allowed.\n        ordered_set: `bool = False` Whether features are required to be ordered.\n        coerce: `bool | None = None` When True, attempts to coerce values to the specified dtype\n            during validation, see :attr:`~lamindb.Schema.coerce`.\n        n_members: `int | None = None` A manual way of specifying the number of features in the schema. 
Is inferred from `features` if passed.\n\n    See Also:\n        :meth:`~lamindb.Artifact.from_dataframe`\n            Validate & annotate a `DataFrame` with a schema.\n        :meth:`~lamindb.Artifact.from_anndata`\n            Validate & annotate an `AnnData` with a schema.\n        :meth:`~lamindb.Artifact.from_mudata`\n            Validate & annotate a `MuData` with a schema.\n        :meth:`~lamindb.Artifact.from_spatialdata`\n            Validate & annotate a `SpatialData` with a schema.\n\n    Examples:\n\n        A schema with a single required feature::\n\n            import lamindb as ln\n\n            schema = ln.Schema([ln.Feature(name=\"required_feature\", dtype=str).save()]).save()\n\n        A schema that constrains feature identifiers to be valid feature names::\n\n            schema = ln.Schema(itype=ln.Feature)  # uses Feature.name as identifier type\n\n        Or valid Ensembl gene ids::\n\n            import bionty as bt\n\n            schema = ln.Schema(itype=bt.Gene.ensembl_gene_id)\n\n        A `flexible` schema that *requires* a single feature but *also* validates & annotates additional features with registered feature identifiers::\n\n            schema = ln.Schema(\n                [ln.Feature(name=\"required_feature\", dtype=str).save()],\n                itype=ln.Feature,\n                flexible=True,\n            ).save()\n\n        Create a schema type to group schemas::\n\n            protein_panel = ln.Schema(name=\"ProteinPanel\", is_type=True).save()\n            schema = ln.Schema(itype=bt.CellMarker, type=protein_panel).save()\n\n        Validate the `index` of a `DataFrame`::\n\n            schema = ln.Schema(\n                [ln.Feature(name=\"required_feature\", dtype=str).save()],\n                index=ln.Feature(name=\"sample\", dtype=ln.ULabel).save(),\n            ).save()\n\n        Mark a feature as `optional`::\n\n            schema = ln.Schema([\n                ln.Feature(name=\"required_feature\", 
dtype=str).save(),\n                ln.Feature(name=\"feature2\", dtype=int).save().with_config(optional=True),\n            ]).save()\n\n        Parse & validate feature identifier values::\n\n            schema = ln.Schema.from_values(\n                adata.var[\"ensemble_id\"],\n                field=bt.Gene.ensembl_gene_id,\n                organism=\"mouse\",\n            ).save()\n\n        Create a schema from a `DataFrame`::\n\n            df = pd.DataFrame({\"feat1\": [1, 2], \"feat2\": [3.1, 4.2], \"feat3\": [\"cond1\", \"cond2\"]})\n            schema = ln.Schema.from_dataframe(df)\n    \"\"\"\n\n    class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta):\n        abstract = False\n        app_label = \"lamindb\"\n        # also see raw SQL constraints for `is_type` and `type` FK validity in migrations\n\n    _name_field: str = \"name\"\n    _aux_fields: dict[str, tuple[str, type]] = {\n        \"1\": (\"optionals\", list[str]),\n        \"3\": (\"index_feature_uid\", str),\n    }\n\n    id: int = models.AutoField(primary_key=True)\n    \"\"\"Internal id, valid only in one DB instance.\"\"\"\n    uid: str = CharField(max_length=16, unique=True, db_index=True, editable=False)\n    \"\"\"A universal id.\"\"\"\n    name: str | None = CharField(max_length=150, null=True, db_index=True)\n    \"\"\"A name.\"\"\"\n    description: str | None = TextField(null=True)\n    \"\"\"A description.\"\"\"\n    n_members: int | None = IntegerField(null=True, default=None)\n    \"\"\"Number of features in the schema. None for type-like schemas.\"\"\"\n    coerce: bool | None = BooleanField(null=True, default=None)\n    \"\"\"Whether dtypes should be coerced during validation. 
None for type-like schemas.\"\"\"\n    flexible: bool | None = BooleanField(null=True, default=None)\n    \"\"\"Indicates how to handle validation and annotation in case features are not defined.\n\n    Examples:\n        Make a rigid schema flexible::\n\n            schema = ln.Schema.get(name=\"my_schema\")\n            schema.flexible = True\n            schema.save()\n\n        During schema creation::\n\n            # if you're not passing features but just defining the itype, defaults to flexible = True\n            schema = ln.Schema(itype=ln.Feature).save()\n            # schema.flexible is True\n\n            # if you're passing features, defaults to flexible = False\n            schema = ln.Schema(\n                features=[ln.Feature(name=\"my_required_feature\", dtype=int).save()],\n            )\n            # schema.flexible is False\n\n            # you can also validate & annotate features in addition to those that you're explicitly defining:\n            schema = ln.Schema(\n                features=[ln.Feature(name=\"my_required_feature\", dtype=int).save()],\n                flexible=True,\n            )\n            # schema.flexible is True\n    \"\"\"\n    type: Schema | None = ForeignKey(\"self\", PROTECT, null=True, related_name=\"schemas\")\n    \"\"\"Type of schema.\n\n    Allows grouping schemas by type, e.g., all measurements evaluating gene expression vs. protein expression vs. 
multi modal.\n\n    You can define types via `ln.Schema(name=\"ProteinPanel\", is_type=True)`.\n\n    Here are a few more examples for type names: `'ExpressionPanel'`, `'ProteinPanel'`, `'Multimodal'`, `'Metadata'`, `'Embedding'`.\n    \"\"\"\n    schemas: RelatedManager[Schema]\n    \"\"\"Schemas of this type (can only be non-empty if `is_type` is `True`).\"\"\"\n    itype: str | None = CharField(\n        max_length=120, db_index=True, null=True, editable=False\n    )\n    \"\"\"A field of a registry that stores feature identifier types, e.g., `'Feature.name'` or `'bionty.Gene.ensembl_gene_id'`.\n    Defaults to the default name field if a registry is passed (passing `Feature` would result in `Feature.name`).\n\n    Depending on `itype`, `.members` stores, e.g., `Feature` or `bionty.Gene` records.\n    \"\"\"\n    otype: str | None = CharField(max_length=64, db_index=True, null=True)\n    \"\"\"Default Python object type, e.g., DataFrame, AnnData.\"\"\"\n    _dtype_str: str | None = CharField(max_length=64, null=True, editable=False)\n    \"\"\"Data type, e.g., \"num\", \"float\", \"int\". 
Is `None` for :class:`~lamindb.Feature`.\n\n    For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.\n    \"\"\"\n    hash: str | None = CharField(\n        max_length=HASH_LENGTH, db_index=True, null=True, editable=False\n    )\n    \"\"\"A hash of the set of feature identifiers.\n\n    For a composite schema, the hash of hashes.\n    \"\"\"\n    minimal_set: bool = BooleanField(default=True, db_index=True, editable=False)\n    \"\"\"Whether all passed features are to be considered required by default (default `True`).\n\n    Note that features that are explicitly marked as `optional` via `feature.with_config(optional=True)`\n    are **not** required even if this `minimal_set` is true.\n    \"\"\"\n    ordered_set: bool = BooleanField(default=False, db_index=True, editable=False)\n    \"\"\"Whether features are required to be ordered (default `False`).\"\"\"\n    maximal_set: bool = BooleanField(default=False, db_index=True, editable=False)\n    \"\"\"Whether all features present in the dataset must be in the schema (default `False`).\n\n    If `False`, additional features are allowed to be present in the dataset.\n\n    If `True`, no additional features are allowed to be present in the dataset.\n    \"\"\"\n    components: RelatedManager[Schema] = ManyToManyField(\n        \"self\", through=\"SchemaComponent\", symmetrical=False, related_name=\"composites\"\n    )\n    \"\"\"Components of this schema ← :attr:`~lamindb.Schema.composites`.\"\"\"\n    composites: RelatedManager[Schema]\n    \"\"\"The composite schemas that contains this schema as a component ← :attr:`~lamindb.Schema.components`.\n\n    For example, an `AnnData` composes multiple schemas: `var[DataFrameT]`, `obs[DataFrame]`, `obsm[Array]`, `uns[dict]`, etc.\n    \"\"\"\n    features: RelatedManager[Feature]\n    \"\"\"The features contained in the schema ← :attr:`~lamindb.Feature.schemas`.\"\"\"\n    artifacts: RelatedManager[Artifact]\n    
\"\"\"The artifacts with an inferred schema that matches this schema ← :attr:`~lamindb.Artifact.schemas`.\"\"\"\n    validated_artifacts: Artifact\n    \"\"\"The artifacts that were validated against this schema ← :attr:`~lamindb.Artifact.schema`.\"\"\"\n    projects: RelatedManager[Project]\n    \"\"\"Linked projects ← :attr:`~lamindb.Project.schemas`.\"\"\"\n    records: RelatedManager[Record]\n    \"\"\"Records that were annotated with this schema ← :attr:`~lamindb.Record.schema`.\"\"\"\n    ablocks: RelatedManager[SchemaBlock]\n    \"\"\"Attached blocks ← :attr:`~lamindb.SchemaBlock.schema`.\"\"\"\n\n    @overload\n    def __init__(\n        self,\n        features: list[SQLRecord]\n        | SQLRecordList\n        | list[tuple[Feature, dict]]\n        | None = None,\n        *,\n        slots: dict[str, Schema] | None = None,\n        name: str | None = None,\n        description: str | None = None,\n        itype: str | Registry | FieldAttr | None = None,\n        type: Schema | None = None,\n        is_type: bool = False,\n        index: Feature | None = None,\n        flexible: bool | None = None,\n        otype: str | None = None,\n        dtype: str | Type[int | float | str] | None = None,  # noqa\n        minimal_set: bool = True,\n        maximal_set: bool = False,\n        ordered_set: bool = False,\n        coerce: bool | None = None,\n        n_members: int | None = None,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(\n        self,\n        *args,\n        **kwargs,\n    ):\n        if len(args) == len(self._meta.concrete_fields):\n            super().__init__(*args, **kwargs)\n            return None\n        if len(args) > 1:\n            raise ValueError(\"Only one non-keyword arg allowed: features\")\n\n        features: list[SQLRecord] | None = (\n            args[0] if args else kwargs.pop(\"features\", [])\n        )\n        index: Feature | None = kwargs.pop(\"index\", 
None)\n        slots: dict[str, Schema] = kwargs.pop(\"slots\", {})\n        name: str | None = kwargs.pop(\"name\", None)\n        description: str | None = kwargs.pop(\"description\", None)\n        itype: str | SQLRecord | DeferredAttribute | None = kwargs.pop(\"itype\", None)\n        flexible: bool | None = kwargs.pop(\"flexible\", None)\n        type: Feature | None = kwargs.pop(\"type\", None)\n        is_type: bool = kwargs.pop(\"is_type\", False)\n        otype: str | None = kwargs.pop(\"otype\", None)\n        dtype: str | None = kwargs.pop(\"dtype\", None)\n        minimal_set: bool = kwargs.pop(\"minimal_set\", True)\n        ordered_set: bool = kwargs.pop(\"ordered_set\", False)\n        maximal_set: bool = kwargs.pop(\"maximal_set\", False)\n        if \"coerce_dtype\" in kwargs:\n            warnings.warn(\n                \"`coerce_dtype` argument was renamed to `coerce` and will be removed in a future release.\",\n                DeprecationWarning,\n                stacklevel=2,\n            )\n            coerce_dtype = kwargs.pop(\"coerce_dtype\")\n        else:\n            coerce_dtype = kwargs.pop(\"coerce\", None)\n        using: str | None = kwargs.pop(\"using\", None)\n        if \"n\" in kwargs:\n            warnings.warn(\n                \"`n` argument was renamed to `n_members` and will be removed in a future release.\",\n                DeprecationWarning,\n                stacklevel=2,\n            )\n            n_features = kwargs.pop(\"n\")\n        else:\n            n_features = kwargs.pop(\"n_members\", None)\n        kwargs.pop(\"branch\", None)\n        kwargs.pop(\"branch_id\", 1)\n        kwargs.pop(\"space\", None)\n        kwargs.pop(\"space_id\", 1)\n        # backward compat\n        if not slots:\n            if \"components\" in kwargs:\n                logger.warning(\n                    \"`components` as a keyword argument is deprecated, please use `slots` instead\"\n                )\n                slots = 
kwargs.pop(\"components\")\n        if kwargs:\n            valid_keywords = \", \".join([val[0] for val in _get_record_kwargs(Schema)])\n            raise FieldValidationError(\n                f\"Only {valid_keywords} are valid keyword arguments\"\n            )\n        (\n            features,\n            validated_kwargs,\n            optional_features,\n            features_registry,\n            flexible,\n        ) = self._validate_kwargs_calculate_hash(\n            features=features,\n            index=index,\n            slots=slots,\n            name=name,\n            description=description,\n            itype=itype,\n            flexible=flexible,\n            type=type,\n            is_type=is_type,\n            otype=otype,\n            dtype=dtype,\n            minimal_set=minimal_set,\n            ordered_set=ordered_set,\n            maximal_set=maximal_set,\n            coerce=coerce_dtype,\n            n_features=n_features,\n        )\n        if not features and not slots and not is_type and not itype:\n            raise InvalidArgument(\n                \"Please pass features or slots or itype or set is_type=True\"\n            )\n        if not is_type:\n            schema = (\n                Schema.objects.using(using)\n                .filter(\n                    ~Q(branch_id=-1),\n                    hash=validated_kwargs[\"hash\"],\n                )\n                .one_or_none()\n            )\n            if schema is not None:\n                logger.important(f\"returning schema with same hash: {schema}\")\n                init_self_from_db(self, schema)\n                update_attributes(self, validated_kwargs)\n                self.optionals.set(optional_features)\n                return None\n        self._slots: dict[str, Schema] = {}\n\n        if features:\n            self._features = (get_related_name(features_registry), features)  # type: ignore\n        if slots:\n            for slot_key, component in 
slots.items():\n                if component._state.adding:\n                    raise InvalidArgument(\n                        f\"schema for {slot_key} {component} must be saved before use\"\n                    )\n            self._slots = slots\n\n        if validated_kwargs[\"hash\"] in KNOWN_SCHEMAS:\n            validated_kwargs[\"uid\"] = KNOWN_SCHEMAS[validated_kwargs[\"hash\"]]\n        else:\n            validated_kwargs[\"uid\"] = base62_16()\n\n        super().__init__(**validated_kwargs)\n\n    def query_schemas(self) -> QuerySet:\n        \"\"\"Query schemas of sub types.\n\n        While `.schemas` retrieves the schemas with the current type, this method\n        also retrieves sub types and the schemas with sub types of the current type.\n        \"\"\"\n        return _query_relatives([self], \"schemas\")  # type: ignore\n\n    def _validate_kwargs_calculate_hash(\n        self,\n        features: list[SQLRecord],\n        index: Feature | None,\n        slots: dict[str, Schema],\n        name: str | None,\n        description: str | None,\n        itype: str | SQLRecord | DeferredAttribute | None,\n        flexible: bool | None,\n        type: Feature | None,\n        is_type: bool,\n        otype: str | None,\n        dtype: str | None,\n        minimal_set: bool,\n        ordered_set: bool,\n        maximal_set: bool,\n        coerce: bool | None,\n        n_features: int | None,\n        optional_features_manual: list[Feature] | None = None,\n    ) -> tuple[list[Feature], dict[str, Any], list[Feature], Registry, bool]:\n        optional_features = []\n        features_registry: Registry = None\n        if itype is not None:\n            if itype != \"Composite\":\n                itype = serialize_dtype(itype, is_itype=True)\n            else:\n                warnings.warn(\n                    \"please do not pass the deprecated itype='Composite'\", stacklevel=2\n                )\n        if index is not None:\n            if not 
isinstance(index, Feature):\n                raise TypeError(\"index must be a Feature\")\n            features.insert(0, index)\n        if features:\n            features, configs = get_features_config(features)\n            features_registry = validate_features(features)\n            itype_compare = features_registry.__get_name_with_module__()\n            if itype is not None:\n                assert itype.startswith(itype_compare), str(itype_compare)  # noqa: S101\n            else:\n                itype = itype_compare\n            if n_features is not None:\n                if n_features != len(features):\n                    logger.important(f\"updating to n {len(features)} features\")\n            n_features = len(features)\n            if features_registry == Feature:\n                optional_features = [\n                    config[0] for config in configs if config[1].get(\"optional\")\n                ]\n                if optional_features:\n                    assert optional_features_manual is None  # noqa: S101\n                if not optional_features and optional_features_manual is not None:\n                    optional_features = optional_features_manual\n        # n_features stays None if no features passed (flexible schema)\n        if dtype is None:\n            dtype = None if itype is not None and itype == \"Feature\" else NUMBER_TYPE\n        else:\n            dtype = get_type_str(dtype)\n        if slots:\n            if otype is None:\n                raise InvalidArgument(\"Please pass otype != None for composite schemas\")\n        flexible_default = n_features is None\n        if flexible is None:\n            flexible = flexible_default\n        if itype is not None and not isinstance(itype, str):\n            itype_str = serialize_dtype(itype, is_itype=True)\n        else:\n            itype_str = itype\n        validated_kwargs = {\n            \"name\": name,\n            \"description\": description,\n            \"type\": 
type,\n            \"is_type\": is_type,\n            \"_dtype_str\": dtype,\n            \"otype\": otype,\n            \"n_members\": n_features,\n            \"itype\": itype_str,\n            \"minimal_set\": minimal_set,\n            \"ordered_set\": ordered_set,\n            \"maximal_set\": maximal_set,\n            \"coerce\": coerce if coerce else None,\n            \"flexible\": flexible,\n        }\n        n_features_default = (\n            None  # None means flexible schema (no fixed number of features)\n        )\n        coerce_default = False\n        aux_dict: dict[str, dict[str, bool | str | list[str]]] = {}\n\n        # optional features (key \"1\") - remains in _aux\n        if optional_features:\n            aux_dict.setdefault(\"af\", {})[\"1\"] = [f.uid for f in optional_features]\n\n        # index feature (key \"3\") - remains in _aux\n        if index is not None:\n            aux_dict.setdefault(\"af\", {})[\"3\"] = index.uid\n\n        if aux_dict:\n            validated_kwargs[\"_aux\"] = aux_dict\n        HASH_CODE = {\n            \"_dtype_str\": \"a\",\n            \"itype\": \"b\",\n            \"minimal_set\": \"c\",\n            \"ordered_set\": \"d\",\n            \"maximal_set\": \"e\",\n            \"flexible\": \"f\",\n            \"coerce_dtype\": \"g\",\n            \"n\": \"h\",\n            \"optional\": \"i\",\n            \"features_hash\": \"j\",\n            \"index\": \"k\",\n            \"slots_hash\": \"l\",\n        }\n        # we do not want pure informational annotations like otype, name, type, is_type, otype to be part of the hash\n        hash_args = [\"_dtype_str\", \"itype\", \"minimal_set\", \"ordered_set\", \"maximal_set\"]\n        list_for_hashing = [\n            f\"{HASH_CODE[arg]}={validated_kwargs[arg]}\"\n            for arg in hash_args\n            if validated_kwargs[arg] is not None\n        ]\n        # only include in hash if not default so that it's backward compatible with records for which 
flexible was never set\n        if flexible != flexible_default:\n            list_for_hashing.append(f\"{HASH_CODE['flexible']}={flexible}\")\n        if coerce is not None and coerce != coerce_default:\n            list_for_hashing.append(f\"{HASH_CODE['coerce_dtype']}={coerce}\")\n        if n_features is not None and n_features != n_features_default:\n            list_for_hashing.append(f\"{HASH_CODE['n']}={n_features}\")\n        if index is not None:\n            list_for_hashing.append(f\"{HASH_CODE['index']}={index.uid}\")\n        if features:\n            if optional_features:\n                feature_list_for_hashing = [\n                    feature.uid\n                    if feature not in set(optional_features)\n                    else f\"{feature.uid}({HASH_CODE['optional']})\"\n                    for feature in features\n                ]\n            else:\n                feature_list_for_hashing = [feature.uid for feature in features]\n            if not ordered_set:  # order matters if ordered_set is True, if not sort\n                feature_list_for_hashing = sorted(feature_list_for_hashing)\n            features_hash = hash_string(\":\".join(feature_list_for_hashing))\n            list_for_hashing.append(f\"{HASH_CODE['features_hash']}={features_hash}\")\n        if slots:\n            slots_list_for_hashing = sorted(\n                [f\"{key}={component.hash}\" for key, component in slots.items()]\n            )\n            slots_hash = hash_string(\":\".join(slots_list_for_hashing))\n            list_for_hashing.append(f\"{HASH_CODE['slots_hash']}={slots_hash}\")\n\n        if is_type:\n            validated_kwargs[\"hash\"] = None\n        else:\n            self._list_for_hashing = sorted(list_for_hashing)\n            schema_hash = hash_string(\":\".join(self._list_for_hashing))\n            validated_kwargs[\"hash\"] = schema_hash\n\n        return (\n            features,\n            validated_kwargs,\n            
optional_features,\n            features_registry,\n            flexible,\n        )\n\n    @classmethod\n    def from_values(  # type: ignore\n        cls,\n        values: ListLike,\n        field: FieldAttr = Feature.name,\n        dtype: str | None = None,\n        name: str | None = None,\n        mute: bool = False,\n        organism: SQLRecord | str | None = None,\n        source: SQLRecord | None = None,\n        raise_validation_error: bool = True,\n    ) -> Schema:\n        \"\"\"Create feature set for validated features.\n\n        Args:\n            values: A list of values, like feature names or ids.\n            field: The field of a reference registry to map values.\n            dtype: The simple dtype.\n                Defaults to `None` if reference registry is :class:`~lamindb.Feature`,\n                defaults to `\"float\"` otherwise.\n            name: A name.\n            organism: An organism to resolve gene mapping.\n            source: A public ontology to resolve feature identifier mapping.\n            raise_validation_error: Whether to raise a validation error if some values are not valid.\n\n        Raises:\n            ValidationError: If some values are not valid.\n\n        Example:\n\n            ::\n\n                import lamindb as ln\n                import bionty as bt\n\n                features = [ln.Feature(name=feat, dtype=\"str\").save() for feat in [\"feat11\", \"feat21\"]]\n                schema = ln.Schema.from_values(features)\n\n                genes = [\"ENSG00000139618\", \"ENSG00000198786\"]\n                schema = ln.Schema.from_values(genes, bt.Gene.ensembl_gene_id, \"float\")\n        \"\"\"\n        if not isinstance(field, FieldAttr):\n            raise TypeError(\n                \"Argument `field` must be a SQLRecord field, e.g., `Feature.name`\"\n            )\n        if len(values) == 0:\n            raise ValueError(\"Provide a list of at least one value\")\n        if isinstance(values, 
DICT_KEYS_TYPE):\n            values = list(values)\n        registry = field.field.model\n        if registry != Feature and dtype is None:\n            dtype = NUMBER_TYPE\n            logger.debug(\"setting feature set to 'number'\")\n        validated = registry.validate(values, field=field, mute=mute, organism=organism)\n        values_array = np.array(values)\n        validated_values = values_array[validated]\n        if validated.sum() != len(values):\n            not_validated_values = values_array[~validated]\n            msg = (\n                f\"These values could not be validated: {not_validated_values.tolist()}\\n\"\n                f\"If there are no typos, add them to their registry: {registry.__name__}\"\n            )\n            if raise_validation_error:\n                raise ValidationError(msg)\n            elif len(validated_values) == 0:\n                return None  # temporarily return None here\n        validated_features = registry.from_values(\n            validated_values,\n            field=field,\n            organism=organism,\n            source=source,\n        )\n        schema = Schema(\n            features=validated_features,\n            name=name,\n            dtype=get_type_str(dtype),\n        )\n        return schema\n\n    @classmethod\n    def from_dataframe(\n        cls,\n        df: pd.DataFrame,\n        field: FieldAttr = Feature.name,\n        name: str | None = None,\n        mute: bool = False,\n        organism: SQLRecord | str | None = None,\n        source: SQLRecord | None = None,\n    ) -> Schema | None:\n        \"\"\"Create schema for valid columns.\"\"\"\n        registry = field.field.model\n        validated = registry.validate(\n            df.columns, field=field, mute=mute, organism=organism\n        )\n        if validated.sum() == 0:\n            if not mute:\n                logger.warning(\"no validated features, skip creating schema\")\n            return None\n        if registry == 
Feature:\n            validated_features = Feature.from_values(  # type: ignore\n                df.columns, field=field, organism=organism\n            )\n            schema = Schema(\n                list(validated_features), name=name, dtype=None, otype=\"DataFrame\"\n            )\n        else:\n            dtypes = [col.dtype for (_, col) in df.loc[:, validated].items()]\n            if len(set(dtypes)) != 1:\n                raise ValueError(f\"data types are heterogeneous: {set(dtypes)}\")\n            dtype = serialize_pandas_dtype(dtypes[0])\n            validated_features = registry.from_values(\n                df.columns[validated],\n                field=field,\n                organism=organism,\n                source=source,\n            )\n            schema = Schema(\n                features=list(validated_features),\n                name=name,\n                dtype=get_type_str(dtype),\n            )\n        return schema\n\n    @classmethod\n    @deprecated(\"from_dataframe\")\n    def from_df(\n        cls,\n        df: pd.DataFrame,\n        field: FieldAttr = Feature.name,\n        name: str | None = None,\n        mute: bool = False,\n        organism: SQLRecord | str | None = None,\n        source: SQLRecord | None = None,\n    ) -> Schema | None:\n        return cls.from_dataframe(df, field, name, mute, organism, source)\n\n    def save(self, *args, **kwargs) -> Schema:\n        \"\"\"Save schema.\"\"\"\n        from .save import bulk_create\n\n        features_to_delete = []\n        print_hash_mutation_warning = kwargs.pop(\"print_hash_mutation_warning\", True)\n\n        if self.pk is not None:\n            existing_features = self.members.to_list() if self.members.exists() else []\n            if hasattr(self, \"_features\"):\n                features = self._features[1]\n                if features != existing_features:\n                    features_to_delete = [\n                        f for f in existing_features if f not in 
features\n                    ]\n            else:\n                features = existing_features\n            index_feature = self.index\n            index_feature_id = None if index_feature is None else index_feature.id\n            _, validated_kwargs, _, _, _ = self._validate_kwargs_calculate_hash(\n                features=[  # type: ignore\n                    f\n                    for f in features\n                    if index_feature_id is None or f.id != index_feature_id\n                ],\n                index=index_feature,\n                slots=self.slots,\n                name=self.name,\n                description=self.description,\n                itype=self.itype,\n                flexible=self.flexible,\n                type=self.type,\n                is_type=self.is_type,\n                otype=self.otype,\n                dtype=self.dtype,\n                minimal_set=self.minimal_set,\n                ordered_set=self.ordered_set,\n                maximal_set=self.maximal_set,\n                coerce=self.coerce,\n                n_features=self.n_members,\n                optional_features_manual=self.optionals.get(),\n            )\n            if validated_kwargs[\"hash\"] != self.hash:\n                from .artifact import Artifact\n\n                datasets = Artifact.filter(schema=self)\n                if datasets.exists():\n                    if features_to_delete:\n                        logger.warning(\n                            f\"you're removing these features: {features_to_delete}\"\n                        )\n                    if print_hash_mutation_warning:\n                        logger.warning(\n                            f\"you updated the schema hash and might invalidate datasets that were previously validated with this schema:\\n{datasets.to_dataframe()}\"\n                        )\n                self.hash = validated_kwargs[\"hash\"]\n                self.n_members = validated_kwargs[\"n_members\"]\n       
 super().save(*args, **kwargs)\n        if hasattr(self, \"_slots\"):\n            # analogous to save_schema_links in core._data.py\n            # which is called to save feature sets in artifact.save()\n            links = []\n            for slot, component in self._slots.items():\n                kwargs = {\n                    \"composite_id\": self.id,\n                    \"component_id\": component.id,\n                    \"slot\": slot,\n                }\n                links.append(Schema.components.through(**kwargs))\n            bulk_create(links, ignore_conflicts=True)\n            delattr(self, \"_slots\")\n        if hasattr(self, \"_features\"):\n            assert self.n_members > 0  # noqa: S101\n            using: bool | None = kwargs.pop(\"using\", None)\n            related_name, records = self._features\n\n            # self.related_name.set(features) does **not** preserve the order\n            # but orders by the feature primary key\n            # hence we need the following more complicated logic\n            through_model = getattr(self, related_name).through\n            if self.itype == \"Composite\":\n                related_model_split = [\"Feature\"]\n            else:\n                related_model_split = parse_cat_dtype(self.itype, is_itype=True)[\n                    \"registry_str\"\n                ].split(\".\")\n            if len(related_model_split) == 1:\n                related_field = related_model_split[0].lower()\n            else:\n                related_field = related_model_split[1].lower()\n            related_field_id = f\"{related_field}_id\"\n            links = [\n                through_model(**{\"schema_id\": self.id, related_field_id: record.id})\n                for record in records\n            ]\n            through_model.objects.using(using).bulk_create(links, ignore_conflicts=True)\n            getattr(self, related_name).remove(*features_to_delete)\n            delattr(self, \"_features\")\n\n      
  return self\n\n    @property\n    def members(self) -> QuerySet:\n        \"\"\"A queryset for the individual records in the feature set underlying the schema.\n\n        Unlike the many-to-many fields `schema.features`, `schema.genes`, `schema.proteins`, `.members`\n\n            1. returns an ordered `QuerySet` if the schema is saved or a `SQLRecordList` if the schema is unsaved\n            2. doesn't require knowledge of the registry storing the feature identifiers (`ln.Feature`, `bt.Gene`, `bt.Protein`, etc.)\n            3. works for a dynamically created (unsaved) schema\n        \"\"\"\n        if self._state.adding:\n            # this should return a queryset and not a list...\n            # need to fix this\n            return SQLRecordList(self._features[1])  # type: ignore\n        if self.itype == \"Composite\" or self.is_type:\n            return Feature.objects.none()\n        related_name = self._get_related_name()\n        if related_name is None:\n            related_name = \"features\"\n        related_manager = self.__getattribute__(related_name)\n        through_model = related_manager.through\n        using = self._state.db\n        related_fk_name = next(\n            field.name\n            for field in through_model._meta.fields\n            if isinstance(field, models.ForeignKey) and field.name != \"schema\"\n        )\n        # Avoid the previous simple `order_by(\"links_schema__id\")` on the related\n        # manager: a member can be linked to many schemas, and reverse-join ordering\n        # can become ambiguous across DB backends (SQLite vs Postgres). 
Instead, we\n        # order through rows constrained to this schema and preserve that exact order.\n        member_ids = list(\n            through_model.objects.using(using)\n            .filter(schema_id=self.id)\n            .order_by(\"id\")\n            .values_list(f\"{related_fk_name}_id\", flat=True)\n        )\n        if not member_ids:\n            return related_manager.model.objects.using(using).none()\n        preserved_order = models.Case(\n            *[\n                models.When(id=member_id, then=models.Value(idx))\n                for idx, member_id in enumerate(member_ids)\n            ],\n            output_field=models.IntegerField(),\n        )\n        # Order by ids from the through table constrained to this schema to avoid\n        # ambiguous reverse-join ordering when a member is linked to many schemas.\n        return (\n            related_manager.model.objects.using(using)\n            .filter(id__in=member_ids)\n            .order_by(preserved_order)\n        )\n\n    @property\n    def dtype(self) -> str | None:\n        \"\"\"The `dtype` for all features in the schema.\"\"\"\n        return self._dtype_str\n\n    @dtype.setter\n    def dtype(self, value: str | None) -> None:\n        self._dtype_str = value\n\n    @property\n    @deprecated(\"coerce\")\n    def coerce_dtype(self) -> bool | None:\n        \"\"\"Alias for coerce (backward compatibility).\"\"\"\n        return self.coerce\n\n    @coerce_dtype.setter\n    def coerce_dtype(self, value: bool | None) -> None:\n        self.coerce = value\n\n    @property\n    @deprecated(\"n_members\")\n    def n(self) -> int | None:\n        \"\"\"Alias for n_members (backward compatibility).\"\"\"\n        return self.n_members\n\n    @n.setter\n    def n(self, value: int | None) -> None:\n        self.n_members = value\n\n    @property\n    def index(self) -> None | Feature:\n        \"\"\"The feature configured to act as index.\n\n        To unset it, set `schema.index` to 
`None`.\n        \"\"\"\n        if self._index_feature_uid is None:\n            return None\n\n        if hasattr(self, \"_features\"):\n            _, features = self._features\n            for feature in features:\n                if feature.uid == self._index_feature_uid:\n                    return feature\n\n        return self.features.get(uid=self._index_feature_uid)\n\n    @index.setter\n    def index(self, value: None | Feature) -> None:\n        if value is None:\n            current_index = self.index\n            self.features.remove(current_index)\n            self._index_feature_uid = value\n        else:\n            self.features.add(value)\n            self._index_feature_uid = value.uid\n\n    @property\n    def _index_feature_uid(self) -> None | str:\n        \"\"\"The uid of the index feature.\"\"\"\n        if self._aux is not None and \"af\" in self._aux and \"3\" in self._aux[\"af\"]:\n            return self._aux[\"af\"][\"3\"]\n        else:\n            return None\n\n    @_index_feature_uid.setter\n    def _index_feature_uid(self, value: str | None) -> None:\n        self._aux = self._aux or {}\n        if value is None:\n            self._aux.get(\"af\", {}).pop(\"3\")\n        else:\n            self._aux.setdefault(\"af\", {})[\"3\"] = value\n\n    @property\n    def slots(self) -> dict[str, Schema]:\n        \"\"\"Slots.\n\n        Examples:\n\n            ::\n\n                # define composite schema\n                anndata_schema = ln.Schema(\n                    name=\"mini_immuno_anndata_schema\",\n                    otype=\"AnnData\",\n                    slots={\"obs\": obs_schema, \"var\": var_schema},\n                ).save()\n\n                # access slots\n                anndata_schema.slots\n                #> {'obs': <Schema: obs_schema>, 'var': <Schema: var_schema>}\n        \"\"\"\n        if hasattr(self, \"_slots\"):\n            return self._slots\n        self._slots = {\n            link.slot: 
link.component\n            for link in self.components.through.filter(composite_id=self.id)\n        }\n        return self._slots\n\n    @property\n    def optionals(self) -> SchemaOptionals:\n        \"\"\"Manage optional features.\n\n        Example:\n\n            ::\n\n                # a schema with optional \"sample_name\"\n                schema_optional_sample_name = ln.Schema(\n                    features=[\n                        ln.Feature(name=\"sample_id\", dtype=str).save(),  # required\n                        ln.Feature(name=\"sample_name\", dtype=str).save().with_config(optional=True),  # optional\n                    ],\n                ).save()\n\n                # raise ValidationError since `sample_id` is required\n                ln.curators.DataFrameCurator(\n                    pd.DataFrame(\n                        {\n                        \"sample_name\": [\"Sample 1\", \"Sample 2\"],\n                        }\n                    ),\n                    schema=schema_optional_sample_name).validate()\n                )\n\n                # passes because an optional column is missing\n                ln.curators.DataFrameCurator(\n                    pd.DataFrame(\n                        {\n                        \"sample_id\": [\"sample1\", \"sample2\"],\n                        }\n                    ),\n                    schema=schema_optional_sample_name).validate()\n                )\n        \"\"\"\n        return SchemaOptionals(self)\n\n    def add_optional_features(self, features: list[Feature]) -> None:\n        \"\"\"Add optional features to the schema.\"\"\"\n        self.features.add(*features)\n        self.optionals.add(features)\n        self.save(print_hash_mutation_warning=False)\n\n    def remove_optional_features(self, features: list[Feature]) -> None:\n        \"\"\"Remove optional features from the schema.\"\"\"\n        optional_features = self.optionals.get()\n        for feature in features:\n            
assert feature in optional_features, f\"Feature {feature} is not optional\"\n        self.features.remove(*features)\n        self.optionals.remove(features)\n        self.save(print_hash_mutation_warning=False)\n\n    @class_and_instance_method\n    def describe(cls_or_self, return_str: bool = False) -> None | str:\n        \"\"\"Describe schema.\"\"\"\n        if isinstance(cls_or_self, type):\n            return type(cls_or_self).describe(cls_or_self)  # type: ignore\n        if cls_or_self.pk is None:\n            raise ValueError(\"Schema must be saved before describing\")\n        tree = describe_schema(cls_or_self)\n        for slot, schema in cls_or_self.slots.items():\n            tree.add(describe_schema(schema, slot=slot))\n        return format_rich_tree(tree, return_str=return_str)\n\n\ndef get_type_str(dtype: str | None) -> str | None:\n    if dtype is not None:\n        type_str = dtype.__name__ if not isinstance(dtype, str) else dtype  # type: ignore\n    else:\n        type_str = None\n    return type_str\n\n\ndef _get_related_name(self: Schema) -> str | None:\n    related_models = dict_related_model_to_related_name(self, instance=self._state.db)\n    if self.itype:\n        related_name = related_models.get(\n            parse_cat_dtype(self.itype, is_itype=True)[\"registry_str\"]\n        )\n        return related_name\n    return None\n\n\nclass SchemaFeature(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    schema: Schema = ForeignKey(Schema, CASCADE, related_name=\"links_feature\")\n    feature: Feature = ForeignKey(Feature, PROTECT, related_name=\"links_schema\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"schema\", \"feature\")\n\n\nclass ArtifactSchema(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    artifact: Artifact = ForeignKey(\"Artifact\", CASCADE, related_name=\"_links_schema\")\n    schema: Schema = ForeignKey(Schema, 
PROTECT, related_name=\"_links_artifact\")\n    slot: str | None = CharField(null=True)\n    feature_ref_is_semantic: bool | None = BooleanField(null=True)\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = ((\"artifact\", \"schema\"), (\"artifact\", \"slot\"))\n\n\nclass SchemaComponent(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    composite: Schema = ForeignKey(Schema, CASCADE, related_name=\"links_component\")\n    component: Schema = ForeignKey(Schema, PROTECT, related_name=\"links_composite\")\n    slot: str | None = CharField(null=True)\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = ((\"composite\", \"slot\", \"component\"), (\"composite\", \"slot\"))\n\n\nSchema._get_related_name = _get_related_name\n\n\n# PostgreSQL migration helpers for auxiliary fields\n# These are used by migrations to efficiently migrate data from _aux to Django fields\n\n\ndef migrate_auxiliary_fields_postgres(schema_editor) -> None:\n    \"\"\"Migrate _aux['af'] fields to Django fields using PostgreSQL raw SQL.\n\n    This efficiently migrates auxiliary fields for all affected models:\n\n    **Artifact:**\n    - _save_completed from _aux['af']['0']\n\n    **Run:**\n    - cli_args from _aux['af']['0']\n\n    **Feature:**\n    - default_value from _aux['af']['0']\n    - nullable from _aux['af']['1'] (default: True)\n    - coerce from _aux['af']['2'] (default: False)\n    - For type features (is_type=True), all values are set to NULL\n\n    **Schema:**\n    - coerce from _aux['af']['0']\n    - flexible from _aux['af']['2'] (or computed from n_members)\n    - n_members (converted from negative to NULL)\n    - For type schemas (is_type=True), all values are set to NULL\n    - Keys '1' (optionals) and '3' (index_feature_uid) are preserved in _aux\n    \"\"\"\n    # Artifact: migrate _save_completed from _aux->'af'->'0'\n    schema_editor.execute(\"\"\"\n        UPDATE 
lamindb_artifact\n        SET _save_completed = (_aux->'af'->>'0')::boolean,\n            _aux = CASE\n                WHEN _aux->'af' IS NOT NULL THEN\n                    CASE\n                        WHEN _aux - 'af' = '{}'::jsonb THEN NULL\n                        ELSE _aux - 'af'\n                    END\n                ELSE _aux\n            END\n        WHERE _aux IS NOT NULL AND _aux->'af' IS NOT NULL\n    \"\"\")\n\n    # Run: migrate cli_args from _aux->'af'->'0'\n    schema_editor.execute(\"\"\"\n        UPDATE lamindb_run\n        SET cli_args = _aux->'af'->>'0',\n            _aux = CASE\n                WHEN _aux - 'af' = '{}'::jsonb THEN NULL\n                ELSE _aux - 'af'\n            END\n        WHERE _aux IS NOT NULL AND _aux ? 'af'\n    \"\"\")\n\n    # Feature: migrate default_value, nullable, coerce\n    # For type features: set all to NULL\n    schema_editor.execute(\"\"\"\n        UPDATE lamindb_feature\n        SET default_value = NULL,\n            nullable = NULL,\n            coerce = NULL,\n            _aux = CASE\n                WHEN _aux->'af' IS NOT NULL THEN\n                    CASE\n                        WHEN _aux - 'af' = '{}'::jsonb THEN NULL\n                        ELSE _aux - 'af'\n                    END\n                ELSE _aux\n            END\n        WHERE is_type = TRUE\n    \"\"\")\n    # For regular features: migrate values with defaults\n    schema_editor.execute(\"\"\"\n        UPDATE lamindb_feature\n        SET default_value = _aux->'af'->'0',\n            nullable = COALESCE((_aux->'af'->>'1')::boolean, TRUE),\n            coerce = COALESCE((_aux->'af'->>'2')::boolean, FALSE),\n            _aux = CASE\n                WHEN _aux->'af' IS NOT NULL THEN\n                    CASE\n                        WHEN _aux - 'af' = '{}'::jsonb THEN NULL\n                        ELSE _aux - 'af'\n                    END\n                ELSE _aux\n            END\n        WHERE is_type = FALSE OR is_type IS NULL\n    
\"\"\")\n\n    # Schema: migrate coerce, flexible, n_members\n    # For type schemas: set all to NULL\n    schema_editor.execute(\"\"\"\n        UPDATE lamindb_schema\n        SET coerce = NULL,\n            flexible = NULL,\n            n_members = NULL,\n            _aux = CASE\n                WHEN _aux->'af' IS NOT NULL THEN\n                    CASE\n                        WHEN ((_aux->'af') #- ARRAY['0'] #- ARRAY['2']) = '{}'::jsonb THEN\n                            CASE WHEN (_aux #- ARRAY['af']) = '{}'::jsonb THEN NULL ELSE _aux #- ARRAY['af'] END\n                        ELSE jsonb_set(_aux #- ARRAY['af'], '{af}', (_aux->'af') #- ARRAY['0'] #- ARRAY['2'])\n                    END\n                ELSE _aux\n            END\n        WHERE is_type = TRUE\n    \"\"\")\n    # For regular schemas: migrate values\n    # Keep '1' (optionals) and '3' (index_feature_uid) in _aux\n    schema_editor.execute(\"\"\"\n        UPDATE lamindb_schema\n        SET coerce = (_aux->'af'->>'0')::boolean,\n            flexible = COALESCE(\n                (_aux->'af'->>'2')::boolean,\n                n_members IS NULL OR n_members < 0\n            ),\n            n_members = CASE WHEN n_members < 0 THEN NULL ELSE n_members END,\n            _aux = CASE\n                WHEN _aux->'af' IS NOT NULL THEN\n                    CASE\n                        WHEN ((_aux->'af') #- ARRAY['0'] #- ARRAY['2']) = '{}'::jsonb THEN\n                            CASE WHEN (_aux #- ARRAY['af']) = '{}'::jsonb THEN NULL ELSE _aux #- ARRAY['af'] END\n                        ELSE jsonb_set(\n                            CASE WHEN (_aux #- ARRAY['af']) = '{}'::jsonb THEN '{}'::jsonb ELSE _aux #- ARRAY['af'] END,\n                            '{af}',\n                            (_aux->'af') #- ARRAY['0'] #- ARRAY['2']\n                        )\n                    END\n                ELSE _aux\n            END\n        WHERE is_type = FALSE OR is_type IS NULL\n    \"\"\")\n"
  },
  {
    "path": "lamindb/models/sqlrecord.py",
    "content": "from __future__ import annotations\n\nimport builtins\nimport gzip\nimport inspect\nimport os\nimport re\nimport shutil\nimport sys\nfrom collections import defaultdict\nfrom itertools import chain\nfrom pathlib import Path\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Literal,\n    NamedTuple,\n    TypeVar,\n    overload,\n)\n\nimport dj_database_url\nimport lamindb_setup as ln_setup\nfrom django.core.exceptions import ValidationError as DjangoValidationError\nfrom django.db import IntegrityError, ProgrammingError, connections, models, transaction\nfrom django.db.models import CASCADE, DEFERRED, PROTECT, Field, Manager, QuerySet\nfrom django.db.models import ForeignKey as django_ForeignKey\nfrom django.db.models.base import ModelBase\nfrom django.db.models.fields.related import (\n    ManyToManyField,\n    ManyToManyRel,\n    ManyToOneRel,\n)\nfrom django.db.models.functions import Lower\nfrom lamin_utils import colors, logger\nfrom lamindb_setup import settings as setup_settings\nfrom lamindb_setup._connect_instance import (\n    INSTANCE_NOT_FOUND_MESSAGE,\n    InstanceNotFoundError,\n    get_owner_name_from_identifier,\n    load_instance_settings,\n    update_db_using_local,\n)\nfrom lamindb_setup.core._docs import doc_args\nfrom lamindb_setup.core._hub_core import connect_instance_hub\nfrom lamindb_setup.core._settings_store import instance_settings_file\nfrom lamindb_setup.core.django import DBToken, db_token_manager\nfrom upath import UPath\n\nfrom lamindb.base.users import current_user_id\nfrom lamindb.base.utils import class_and_instance_method, deprecated\n\nfrom ..base.fields import (\n    BooleanField,\n    CharField,\n    DateTimeField,\n    ForeignKey,\n    JSONField,\n    TextField,\n)\nfrom ..base.types import (\n    BRANCH_CODE_TO_STATUS,\n    BRANCH_STATUS_TO_CODE,\n    BranchStatus,\n    FieldAttr,\n    StrField,\n)\nfrom ..base.uids import base62_12\nfrom ..errors import (\n    FieldValidationError,\n    
NoWriteAccess,\n    ValidationError,\n)\nfrom ._is_versioned import IsVersioned, _adjust_is_latest_when_deleting_is_versioned\nfrom .query_manager import QueryManager, _lookup, _search\n\nif TYPE_CHECKING:\n    from datetime import datetime\n\n    import pandas as pd\n\n    from .block import BranchBlock, SpaceBlock\n    from .project import Project\n    from .query_manager import RelatedManager\n    from .query_set import SQLRecordList\n    from .run import Run, User\n    from .ulabel import ULabel\n\n\nT = TypeVar(\"T\", bound=\"SQLRecord\")\nIPYTHON = getattr(builtins, \"__IPYTHON__\", False)\nUNIQUE_FIELD_NAMES = {\n    \"root\",\n    \"ontology_id\",\n    \"uid\",\n    \"scientific_name\",\n    \"ensembl_gene_id\",\n    \"uniprotkb_id\",\n}\nBRANCH_SENSITIVE_BLOCK_MODEL_NAMES = frozenset(\n    {\n        \"RecordBlock\",\n        \"ArtifactBlock\",\n        \"TransformBlock\",\n        \"CollectionBlock\",\n        \"RunBlock\",\n        \"SchemaBlock\",\n        \"FeatureBlock\",\n        \"ProjectBlock\",\n        \"ULabelBlock\",\n        \"SpaceBlock\",\n    }\n)\n\n\ndef _is_branch_sensitive_model(model: type[BaseSQLRecord]) -> bool:\n    return (\n        issubclass(model, SQLRecord) and model.__name__ not in {\"Storage\", \"Source\"}\n    ) or model.__name__ in BRANCH_SENSITIVE_BLOCK_MODEL_NAMES\n\n\n# -------------------------------------------------------------------------------------\n# A note on required fields at the SQLRecord level\n#\n# As Django does most of its validation on the Form-level, it doesn't offer functionality\n# for validating the integrity of an SQLRecord object upon instantiation (similar to pydantic)\n#\n# For required fields, we define them as commonly done on the SQL level together\n# with a validator in SQLRecord (validate_required_fields)\n#\n# This goes against the Django convention, but goes with the SQLModel convention\n# (Optional fields can be null on the SQL level, non-optional fields cannot)\n#\n# Due to Django's 
convention where CharField has pre-configured (null=False, default=\"\"), marking\n# a required field necessitates passing `default=None`. Without the validator it would trigger\n# an error at the SQL level; with it, the error is triggered at instantiation\n\n# -------------------------------------------------------------------------------------\n# A note on class and instance methods of core SQLRecord\n#\n# All of these are defined and tested within lamindb, in files starting with _{orm_name}.py\n\n# -------------------------------------------------------------------------------------\n# A note on maximal lengths of char fields\n#\n# 100 characters:\n#     \"Raindrops pitter-pattered on the windowpane, blurring the\"\n#     \"city lights outside, curled up with a mug.\"\n# A good maximal length for a name (title).\n#\n# 150 characters: We choose this for name maximal length because some users like long names.\n#\n# 255 characters:\n#     \"In creating a precise 255-character paragraph, one engages in\"\n#     \"a dance of words, where clarity meets brevity. 
Every syllable counts,\"\n#     \"illustrating the skill in compact expression, ensuring the essence of the\"\n#     \"message shines through within the exacting limit.\"\n\n\nclass IsLink:\n    pass\n\n\nclass HasType(models.Model):\n    \"\"\"Mixin for registries that have a hierarchical `type` assigned.\n\n    Such registries have a `.type` foreign key pointing to themselves.\n\n    A `type` hence allows hierarchically grouping records under types.\n\n    For instance, using the example of `ln.Record`::\n\n        experiment_type = ln.Record(name=\"Experiment\", is_type=True).save()\n        experiment1 = ln.Record(name=\"Experiment 1\", type=experiment_type).save()\n        experiment2 = ln.Record(name=\"Experiment 2\", type=experiment_type).save()\n    \"\"\"\n\n    class Meta:\n        abstract = True\n\n    is_type: bool = BooleanField(default=False, db_default=False, db_index=True)\n    \"\"\"Indicates if record is a `type`.\n\n    For example, if a record \"Compound\" is a `type`, the actual compounds \"darerinib\", \"tramerinib\", would be instances of that `type`.\n    \"\"\"\n\n    def query_types(self) -> SQLRecordList:\n        \"\"\"Query types of a record recursively.\n\n        While `.type` retrieves the `type`, this method\n        retrieves all super types of that `type`::\n\n            # Create type hierarchy\n            type1 = model_class(name=\"Type1\", is_type=True).save()\n            type2 = model_class(name=\"Type2\", is_type=True, type=type1).save()\n            type3 = model_class(name=\"Type3\", is_type=True, type=type2).save()\n\n            # Create a record with type3\n            record = model_class(name=f\"{model_name}3\", type=type3).save()\n\n            # Query super types\n            super_types = record.query_types()\n            assert super_types[0] == type3\n            assert super_types[1] == type2\n            assert super_types[2] == type1\n        \"\"\"\n        from .has_parents import 
_query_ancestors_of_fk\n\n        return _query_ancestors_of_fk(self, \"type\")  # type: ignore\n\n\ndef deferred_attribute__repr__(self):\n    return f\"FieldAttr({self.field.model.__name__}.{self.field.name})\"\n\n\ndef unique_constraint_error_in_error_message(error_msg: str) -> bool:\n    \"\"\"Check if the error message indicates a unique constraint violation.\"\"\"\n    return (\n        \"UNIQUE constraint failed\" in error_msg  # SQLite\n        or \"duplicate key value violates unique constraint\" in error_msg  # Postgre\n    )\n\n\ndef parse_violated_field_from_error_message(error_msg: str) -> list[str] | None:\n    # Even if the model has multiple fields with unique=True,\n    # Django will only raise an IntegrityError for one field at a time\n    # - whichever constraint is violated first during the database insert/update operation.\n    if unique_constraint_error_in_error_message(error_msg):\n        if \"UNIQUE constraint failed\" in error_msg:  # sqlite\n            constraint_field = (\n                error_msg.removeprefix(\"UNIQUE constraint failed: \")\n                .split(\", \")[0]\n                .split(\".\")[-1]\n            )\n            return [constraint_field]\n        else:  # postgres\n            # Extract constraint name from double quotes\n            constraint_name = error_msg.split('\"')[1]\n\n            # Check if it's a multi-column constraint (contains multiple field names)\n            # Format: tablename_field1_field2_..._hash_uniq\n            if \"_uniq\" in constraint_name:\n                # Remove '_uniq' suffix first\n                constraint_name = constraint_name.removesuffix(\"_uniq\")\n\n                # Remove hash (8 hex characters at the end)\n                parts = constraint_name.split(\"_\")\n                if len(parts[-1]) == 8 and all(\n                    c in \"0123456789abcdef\" for c in parts[-1]\n                ):\n                    constraint_name = \"_\".join(parts[:-1])\n\n            
    # Remove table name prefix (e.g., \"bionty_ethnicity_\")\n                # Table name is typically the first 2 parts for app_model format\n                parts = constraint_name.split(\"_\")\n                if len(parts) > 2:\n                    # Assume first 2 parts are table name (e.g., \"bionty_ethnicity\")\n                    field_string = \"_\".join(parts[2:])\n                else:\n                    field_string = constraint_name\n\n                # Now parse the fields from DETAIL line\n                # DETAIL: Key (name, ontology_id)=(South Asian, HANCESTRO:0006) already exists.\n                if \"Key (\" in error_msg:\n                    fields_part = error_msg.split(\"Key (\")[1].split(\")=\")[0]\n                    fields = [f.strip() for f in fields_part.split(\",\")]\n                    return fields\n\n                # Fallback if DETAIL line not available\n                return [field_string]\n            else:\n                # Single field constraint (ends with _key)\n                constraint_field = constraint_name.removesuffix(\"_key\").split(\"_\")[-1]\n                return [constraint_field]\n\n    return None\n\n\nFieldAttr.__repr__ = deferred_attribute__repr__  # type: ignore\n\n\nclass ValidateFields:\n    pass\n\n\ndef is_approx_pascal_case(s: str) -> bool:\n    \"\"\"Check if the last component of a dotted string is in PascalCase.\n\n    Args:\n        s: The string to check\n    \"\"\"\n    if \"[\" in s:  # this is because we allow types of form 'script[test_script.py]'\n        return True\n    last_component = s.split(\".\")[-1]\n\n    return last_component[:1].isupper() and \"_\" not in last_component\n\n\ndef init_self_from_db(self: SQLRecord, existing_record: SQLRecord):\n    from .run import current_run\n\n    new_args = [\n        getattr(existing_record, field.attname) for field in self._meta.concrete_fields\n    ]\n    super(self.__class__, self).__init__(*new_args)\n    self._state.adding = False  # 
mimic from_db\n    self._state.db = \"default\"\n    # if run was not set on the existing record, set it to the current_run\n    if hasattr(self, \"run_id\") and self.run_id is None and current_run() is not None:\n        logger.warning(f\"run was not set on {self}, setting to current run\")\n        self.run = current_run()\n\n\ndef update_attributes(record: SQLRecord, attributes: dict[str, str]):\n    for key, value in attributes.items():\n        if getattr(record, key) != value and value is not None:\n            if key not in {\"uid\", \"_dtype_str\", \"otype\", \"hash\"}:\n                logger.warning(f\"updated {key} from {getattr(record, key)} to {value}\")\n                setattr(record, key, value)\n            else:\n                hash_message = (\n                    \"recomputing on .save()\"\n                    if key == \"hash\"\n                    else f\"keeping {getattr(record, key)}\"\n                )\n                logger.debug(\n                    f\"ignoring tentative value {value} for {key}, {hash_message}\"\n                )\n\n\ndef validate_literal_fields(record: SQLRecord, kwargs) -> None:\n    \"\"\"Validate all Literal type fields in a record.\n\n    Args:\n        record: record being validated\n\n    Raises:\n        ValidationError: If any field value is not in its Literal's allowed values\n    \"\"\"\n    if isinstance(record, IsLink):\n        return None\n    if record.__class__.__name__ in \"Feature\":\n        return None\n    from lamindb.base.types import ArtifactKind, Dtype, TransformKind\n\n    types = {\n        \"TransformKind\": TransformKind,\n        \"ArtifactKind\": ArtifactKind,\n        \"Dtype\": Dtype,\n    }\n    errors = {}\n    annotations = getattr(record.__class__, \"__annotations__\", {})\n    for field_name, annotation in annotations.items():\n        if field_name not in kwargs or kwargs[field_name] is None:\n            continue\n        value = kwargs[field_name]\n        if str(annotation) 
in types:\n            annotation = types[annotation]\n        if not hasattr(annotation, \"__origin__\"):\n            continue\n        literal_type = annotation if annotation.__origin__ is Literal else None\n        if literal_type is None:\n            continue\n        valid_values = set(literal_type.__args__)\n        if value not in valid_values:\n            errors[field_name] = (\n                f\"{field_name}: {colors.yellow(value)} is not a valid value\"\n                f\"\\n    → Valid values are: {colors.green(', '.join(sorted(valid_values)))}\"\n            )\n    if errors:\n        message = \"\\n  \"\n        for _, error in errors.items():\n            message += error + \"\\n  \"\n        raise FieldValidationError(message)\n\n\ndef validate_fields(record: SQLRecord, kwargs):\n    from lamindb.models import (\n        Artifact,\n        Collection,\n        Feature,\n        Run,\n        Schema,\n        Transform,\n        ULabel,\n    )\n\n    # validate required fields\n    # a \"required field\" is a Django field that has `null=False, default=None`\n    required_fields = {\n        k.name for k in record._meta.fields if not k.null and k.default is None\n    }\n    required_fields_not_passed = {k: None for k in required_fields if k not in kwargs}\n    kwargs.update(required_fields_not_passed)\n    missing_fields = [\n        k for k, v in kwargs.items() if v is None and k in required_fields\n    ]\n    if missing_fields:\n        raise FieldValidationError(f\"{missing_fields} are required.\")\n    # ensure the exact length of the internal uid for core entities\n    if \"uid\" in kwargs and record.__class__ in {\n        Artifact,\n        Collection,\n        Transform,\n        Run,\n        ULabel,\n        Feature,\n        Schema,\n    }:\n        uid_max_length = record.__class__._meta.get_field(\n            \"uid\"\n        ).max_length  # triggers FieldDoesNotExist\n        if len(kwargs[\"uid\"]) != uid_max_length:  # triggers 
KeyError\n            if not (\n                record.__class__ is Schema and len(kwargs[\"uid\"]) == 16\n            ):  # no error for schema\n                raise ValidationError(\n                    f\"`uid` must be exactly {uid_max_length} characters long, got {len(kwargs['uid'])}.\"\n                )\n    # validate is_type\n    if \"is_type\" in kwargs and \"name\" in kwargs and kwargs[\"is_type\"]:\n        is_approx_pascal_case(kwargs[\"name\"])\n    if (\n        \"type\" in kwargs\n        and isinstance(kwargs[\"type\"], HasType)\n        and not kwargs[\"type\"].is_type\n    ):\n        object_name = record.__class__.__name__.lower()\n        raise ValueError(\n            f\"You can only assign a {object_name} with `is_type=True` as `type` to another {object_name}, but this doesn't have it: {kwargs['type']}\"\n        )\n    # validate literals\n    validate_literal_fields(record, kwargs)\n\n\ndef suggest_records_with_similar_names(\n    record: SQLRecord, name_field: str, kwargs\n) -> SQLRecord | None:\n    \"\"\"Returns a record if found exact match, otherwise None.\n\n    Logs similar matches if found.\n    \"\"\"\n    if kwargs.get(name_field) is None or not isinstance(kwargs.get(name_field), str):\n        return None\n    # need to perform an additional request to find the exact match\n    # previously, this was inferred from the truncated/fuzzy search below\n    # but this isn't reliable: https://laminlabs.slack.com/archives/C04FPE8V01W/p1737812808563409\n    # the below needs to be .first() because there might be multiple records with the same\n    # name field in case the record is versioned (e.g. 
for Transform key)\n    if isinstance(record, HasType):\n        if kwargs.get(\"type\", None) is None:\n            subset = record.__class__.filter(type__isnull=True)\n        else:\n            subset = record.__class__.filter(type=kwargs[\"type\"])\n    else:\n        subset = record.__class__\n    exact_match = subset.filter(**{name_field: kwargs[name_field]}).first()\n    if exact_match is not None:\n        return exact_match\n    queryset = _search(\n        subset,\n        kwargs[name_field],\n        field=name_field,\n        truncate_string=True,\n        limit=3,\n    )\n    if not queryset.exists():  # empty queryset\n        return None\n    s, it, nots, record_text = (\n        (\"\", \"it\", \"s\", \"a record\")\n        if len(queryset) == 1\n        else (\"s\", \"one of them\", \"\", \"records\")\n    )\n    similar_names = \", \".join(f\"'{getattr(record, name_field)}'\" for record in queryset)\n    msg = f\"you are trying to create a record with name='{kwargs[name_field]}' but {record_text} with similar {name_field}{s} exist{nots}: {similar_names}. 
Did you mean to load {it}?\"\n    logger.warning(f\"{msg}\")\n\n    return None\n\n\ndef delete_record(record: BaseSQLRecord, is_soft: bool = True):\n    def delete():\n        if is_soft:\n            record.branch_id = -1\n            record.save()\n            return None\n        else:\n            return super(BaseSQLRecord, record).delete()\n\n    # deal with versioned records\n    # if _overwrite_versions = True, there is only a single version and\n    # no need to set the new latest version because all versions are deleted\n    # when deleting the latest version\n    if (\n        isinstance(record, IsVersioned)\n        and record.is_latest\n        and not getattr(record, \"_overwrite_versions\", False)\n    ):\n        promoted = _adjust_is_latest_when_deleting_is_versioned(record)\n        if promoted:\n            if is_soft:\n                record.is_latest = False\n            with transaction.atomic():\n                result = delete()\n            return result\n    # deal with all other cases of the nested if condition now\n    return delete()\n\n\nRECORD_REGISTRY_EXAMPLE = \"\"\"Example::\n\n        from lamindb import SQLRecord, fields\n\n        # sub-classing `SQLRecord` creates a new registry\n        class Experiment(SQLRecord):\n            name: str = fields.CharField()\n\n        # instantiating `Experiment` creates a record `experiment`\n        experiment = Experiment(name=\"my experiment\")\n\n        # you can save the record to the database\n        experiment.save()\n\n        # `Experiment` refers to the registry, which you can query\n        df = Experiment.filter(name__startswith=\"my \").to_dataframe()\n\"\"\"\n\n\ndef _synchronize_clone(storage_root: str) -> str | None:\n    \"\"\"Synchronizes a clone to the local SQLite path.\n\n    Args:\n        storage_root: The storage root path of the (target) instance\n    \"\"\"\n    cloud_db_path = UPath(storage_root) / \".lamindb\" / \"lamin.db\"\n    local_sqlite_path = 
ln_setup.settings.cache_dir / cloud_db_path.path.lstrip(\"/\")\n\n    local_sqlite_path.parent.mkdir(parents=True, exist_ok=True)\n    cloud_db_path_gz = UPath(str(cloud_db_path) + \".gz\", anon=True)\n    local_sqlite_path_gz = Path(str(local_sqlite_path) + \".gz\")\n\n    try:\n        if cloud_db_path_gz.synchronize_to(\n            local_sqlite_path_gz, error_no_origin=True, print_progress=True\n        ):\n            with (\n                gzip.open(local_sqlite_path_gz, \"rb\") as f_in,\n                open(local_sqlite_path, \"wb\") as f_out,\n            ):\n                shutil.copyfileobj(f_in, f_out)\n        return f\"sqlite:///{local_sqlite_path}\"\n    except (FileNotFoundError, PermissionError):\n        logger.debug(\"Clone not found. Falling back to normal access...\")\n        return None\n\n\n# this is the metaclass for SQLRecord\n@doc_args(RECORD_REGISTRY_EXAMPLE)\nclass Registry(ModelBase):\n    \"\"\"Metaclass for :class:`~lamindb.models.SQLRecord`.\n\n    Each `Registry` *object* is a `SQLRecord` *class* and corresponds to a table in the metadata SQL database.\n\n    You work with `Registry` objects whenever you use *class methods* of `SQLRecord`.\n\n    You call any subclass of `SQLRecord` a \"registry\" and their objects \"records\". 
A `SQLRecord` object corresponds to a row in the SQL table.\n\n    If you want to create a new registry, you sub-class `SQLRecord`.\n\n    {}\n\n    Note: `Registry` inherits from Django's `ModelBase`.\n    \"\"\"\n\n    _available_fields: set[str] = None\n\n    def __new__(cls, name, bases, attrs, **kwargs):\n        new_class = super().__new__(cls, name, bases, attrs, **kwargs)\n        return new_class\n\n    # below creates a sensible auto-complete behavior that differs across the\n    # class and instance level in Jupyter editors; it doesn't have any effect for\n    # static type analyzers like pylance used in VSCode\n    def __dir__(cls):\n        # this is needed to bring auto-complete on the class-level back\n        # https://laminlabs.slack.com/archives/C04FPE8V01W/p1717535625268849\n        # Filter class attributes, excluding instance methods\n        exclude_instance_methods = \"sphinx\" not in sys.modules\n        # https://laminlabs.slack.com/archives/C04FPE8V01W/p1721134595920959\n\n        def include_attribute(attr_name, attr_value):\n            if attr_name.startswith(\"__\"):\n                return False\n            if exclude_instance_methods and callable(attr_value):\n                return isinstance(attr_value, (classmethod, staticmethod, type))\n            return True\n\n        # check also inherited attributes\n        if hasattr(cls, \"mro\"):\n            attrs = chain(*(c.__dict__.items() for c in cls.mro()))\n        else:\n            attrs = cls.__dict__.items()\n\n        result = []\n        for attr_name, attr_value in attrs:\n            if attr_name not in result and include_attribute(attr_name, attr_value):\n                result.append(attr_name)\n\n        # Add non-dunder attributes from Registry\n        for attr in dir(Registry):\n            if not attr.startswith(\"__\") and attr not in result:\n                result.append(attr)\n        return result\n\n    def describe(cls, return_str: bool = False) -> str | 
None:\n        \"\"\"Describe the fields of the registry.\"\"\"\n        from ._describe import strip_ansi_from_string as _strip_ansi\n\n        repr_str = f\"{colors.green(cls.__name__)}\\n\"\n        info = SQLRecordInfo(cls)\n        repr_str += info.get_simple_fields(return_str=True)\n        repr_str += info.get_relational_fields(return_str=True)\n        repr_str = repr_str.rstrip(\"\\n\")\n        if return_str:\n            return _strip_ansi(repr_str)\n        else:\n            print(repr_str)\n            return None\n\n    @doc_args(_lookup.__doc__)\n    def lookup(\n        cls,\n        field: StrField | None = None,\n        return_field: StrField | None = None,\n        keep: Literal[\"first\", \"last\", False] = \"first\",\n    ) -> NamedTuple:\n        \"\"\"{}\"\"\"  # noqa: D415\n        return _lookup(cls=cls, field=field, return_field=return_field, keep=keep)\n\n    def filter(cls, *queries, **expressions) -> QuerySet:\n        \"\"\"Query records.\n\n        Args:\n            queries: One or multiple `Q` objects.\n            expressions: Fields and values passed as Django query expressions.\n\n        See Also:\n            - Guide: :doc:`docs:registries`\n            - Django documentation: `Queries <https://docs.djangoproject.com/en/stable/topics/db/queries/>`__\n\n        Examples:\n            >>> ln.Project(name=\"my label\").save()\n            >>> ln.Project.filter(name__startswith=\"my\").to_dataframe()\n        \"\"\"\n        from .query_set import QuerySet\n\n        _using_key = None\n        if \"_using_key\" in expressions:\n            _using_key = expressions.pop(\"_using_key\")\n\n        return QuerySet(model=cls, using=_using_key).filter(*queries, **expressions)\n\n    def get(\n        cls: type[T],\n        idlike: int | str | None = None,\n        **expressions,\n    ) -> T:\n        \"\"\"Get a single record.\n\n        Args:\n            idlike: Either a uid stub, uid or an integer id.\n            expressions: 
Fields and values passed as Django query expressions.\n\n        Raises:\n            :exc:`lamindb.errors.ObjectDoesNotExist`: In case no matching record is found.\n\n        See Also:\n            - Guide: :doc:`registries`\n            - Django documentation: `Queries <https://docs.djangoproject.com/en/stable/topics/db/queries/>`__\n\n        Examples:\n\n            ::\n\n                record = ln.Record.get(\"FvtpPJLJ\")\n                record = ln.Record.get(name=\"my-label\")\n        \"\"\"\n        from .query_set import QuerySet\n\n        return QuerySet(model=cls).get(idlike, **expressions)\n\n    def to_dataframe(\n        cls,\n        *,\n        include: str | list[str] | None = None,\n        features: str | list[str] | None = None,\n        limit: int | None = 100,\n        order_by: str | None = \"-id\",\n    ) -> pd.DataFrame:\n        \"\"\"Evaluate and convert to `pd.DataFrame`.\n\n        By default, this returns up to 100 rows for a fast overview.\n        Pass `limit=None` to fetch all matching records.\n\n        By default, maps simple fields and foreign keys onto `DataFrame` columns.\n\n        Guide: :doc:`docs:registries`\n\n        Args:\n            include: Related data to include as columns. Takes strings of\n                form `\"records__name\"`, `\"cell_types__name\"`, etc. or a list\n                of such strings. For `Artifact`, `Record`, and `Run`, can also pass `\"features\"`\n                to include features with data types pointing to entities in the core schema.\n                If `\"privates\"`, includes private fields (fields starting with `_`).\n            features: Configure the features to include. Can be a feature name or a list of such names.\n                If `\"queryset\"`, infers the features used within the current queryset.\n                Only available for `Artifact`, `Record`, and `Run`.\n            limit: Maximum number of rows to display. Defaults to 100. 
If `None`,\n                includes all results.\n            order_by: Field name to order the records by. Prefix with '-' for descending order.\n                Defaults to '-id' to get the most recent records. This argument is ignored\n                if the queryset is already ordered or if the specified field does not exist.\n\n        Examples:\n\n            Include the name of the creator::\n\n                ln.Record.to_dataframe(include=\"created_by__name\")\n\n            Include features::\n\n                ln.Artifact.to_dataframe(include=\"features\")\n\n            Include selected features::\n\n                ln.Artifact.to_dataframe(features=[\"cell_type_by_expert\", \"cell_type_by_model\"])\n        \"\"\"\n        return cls.filter().to_dataframe(\n            include=include, features=features, order_by=order_by, limit=limit\n        )\n\n    @deprecated(new_name=\"to_dataframe\")\n    def df(\n        cls,\n        *,\n        include: str | list[str] | None = None,\n        features: str | list[str] | None = None,\n        limit: int | None = 100,\n        order_by: str | None = \"-id\",\n    ) -> pd.DataFrame:\n        return cls.to_dataframe(\n            include=include, features=features, limit=limit, order_by=order_by\n        )\n\n    @doc_args(_search.__doc__)\n    def search(\n        cls,\n        string: str,\n        *,\n        field: StrField | None = None,\n        limit: int | None = 20,\n        case_sensitive: bool = False,\n    ) -> QuerySet:\n        \"\"\"{}\"\"\"  # noqa: D415\n        return _search(\n            cls=cls,\n            string=string,\n            field=field,\n            limit=limit,\n            case_sensitive=case_sensitive,\n        )\n\n    @deprecated(new_name=\"connect\")\n    def using(\n        cls,\n        instance: str | None,\n    ) -> QuerySet:\n        return cls.connect(\n            instance=instance,\n        )\n\n    def connect(\n        cls,\n        instance: str | None,\n    ) 
-> QuerySet:\n        \"\"\"Query a non-default LaminDB instance.\n\n        Args:\n            instance: An instance identifier of form \"account_handle/instance_name\".\n\n        Examples:\n\n            ::\n\n                ln.Record.connect(\"account_handle/instance_name\").search(\"label7\", field=\"name\")\n        \"\"\"\n        from .query_set import QuerySet\n\n        # we're in the default instance\n        if instance is None or instance == \"default\":\n            return QuerySet(model=cls, using=None)\n        # connection already established\n        if instance in connections:\n            return QuerySet(model=cls, using=instance)\n\n        owner, name = get_owner_name_from_identifier(instance)\n        current_instance_owner_name: list[str] = setup_settings.instance.slug.split(\"/\")\n\n        # move on to different instances\n        cache_using_filepath = (\n            setup_settings.cache_dir / f\"instance--{owner}--{name}--uid.txt\"\n        )\n        settings_file = instance_settings_file(name, owner)\n        if not settings_file.exists():\n            result = connect_instance_hub(owner=owner, name=name)\n            if isinstance(result, str):\n                message = INSTANCE_NOT_FOUND_MESSAGE.format(\n                    owner=owner, name=name, hub_result=result\n                )\n                raise InstanceNotFoundError(message)\n            iresult, storage = result\n            # this can happen if querying via an old instance name\n            if [iresult.get(\"owner\"), iresult[\"name\"]] == current_instance_owner_name:\n                return QuerySet(model=cls, using=None)\n            # do not use {} syntax below, it gives rise to a dict if the schema modules\n            # are empty and then triggers a TypeError in missing_members = source_modules - target_modules\n            source_modules = set(  # noqa\n                [mod for mod in iresult[\"schema_str\"].split(\",\") if mod != \"\"]\n            )\n\n       
     # Try to connect to a clone if targeting a public instance but fall back to normal access if access failed\n            db = None\n            if (\n                \"_public\" in iresult[\"db_user_name\"]\n                and \"postgresql\" in iresult[\"db_scheme\"]\n            ):\n                db = _synchronize_clone(storage[\"root\"])\n            if db is None:\n                if [\n                    iresult.get(\"owner\"),\n                    iresult[\"name\"],\n                ] == current_instance_owner_name:\n                    return QuerySet(model=cls, using=None)\n                db = update_db_using_local(iresult, settings_file)\n                is_fine_grained_access = (\n                    iresult[\"fine_grained_access\"]\n                    and iresult[\"db_permissions\"] == \"jwt\"\n                )\n            else:\n                is_fine_grained_access = False\n\n            cache_using_filepath.write_text(\n                f\"{iresult['lnid']}\\n{iresult['schema_str']}\", encoding=\"utf-8\"\n            )\n\n            # access_db can take both: the dict from connect_instance_hub and isettings\n            into_db_token = iresult\n        else:\n            isettings = load_instance_settings(settings_file)\n            source_modules = isettings.modules\n            db = None\n            if \"public\" in isettings.db and isettings.dialect == \"postgresql\":\n                db = _synchronize_clone(isettings.storage.root_as_str)\n\n            # Try to connect to a clone if targeting a public instance but fall back to normal access if access failed\n            if db is None:\n                if [isettings.owner, isettings.name] == current_instance_owner_name:\n                    return QuerySet(model=cls, using=None)\n                db = isettings.db\n                is_fine_grained_access = (\n                    isettings._fine_grained_access\n                    and isettings._db_permissions == \"jwt\"\n                
)\n            else:\n                is_fine_grained_access = False\n\n            cache_using_filepath.write_text(\n                f\"{isettings.uid}\\n{','.join(source_modules)}\", encoding=\"utf-8\"\n            )\n            # access_db can take both: the dict from connect_instance_hub and isettings\n            into_db_token = isettings\n\n        target_modules = setup_settings.instance.modules\n        if missing_members := source_modules - target_modules:\n            logger.info(\n                f\"in transfer, source lamindb instance has additional modules: {', '.join(missing_members)}\"\n            )\n\n        add_db_connection(db, instance)\n        if is_fine_grained_access:\n            db_token = DBToken(into_db_token)\n            db_token_manager.set(db_token, instance)\n\n        return QuerySet(model=cls, using=instance)\n\n    def __get_module_name__(cls) -> str:\n        schema_module_name = cls.__module__.split(\".\")[0]\n        module_name = schema_module_name.replace(\"lnschema_\", \"\")\n        if module_name == \"lamindb\":\n            module_name = \"core\"\n        return module_name\n\n    def __get_name_with_module__(cls) -> str:\n        module_name = cls.__get_module_name__()\n        if module_name == \"core\":\n            module_prefix = \"\"\n        else:\n            module_prefix = f\"{module_name}.\"\n        return f\"{module_prefix}{cls.__name__}\"\n\n    def __get_available_fields__(cls) -> set[str]:\n        if cls._available_fields is None:\n            available_fields = set()\n            for field in cls._meta.get_fields():\n                if not (field_name := field.name).startswith((\"_\", \"links_\")):\n                    available_fields.add(field_name)\n                    if isinstance(field, django_ForeignKey):\n                        available_fields.add(field_name + \"_id\")\n            if cls.__name__ == \"Artifact\":\n                available_fields.add(\"transform\")\n                
available_fields.add(\"feature_sets\")  # backward compat with lamindb v1\n            cls._available_fields = available_fields\n        return cls._available_fields\n\n\nclass BaseSQLRecord(models.Model, metaclass=Registry):\n    \"\"\"Base SQL metadata record.\n\n    It provides methods to `SQLRecord` and all its subclasses,\n    but doesn't come with the additional `branch` and `space` fields.\n    \"\"\"\n\n    objects = QueryManager()\n\n    class Meta:\n        abstract = True\n        base_manager_name = \"objects\"\n\n    # fields to track for changes\n    # if not None, will be tracked in self._original_values as {field_name: value}\n    # use _id fields for foreign keys\n    _TRACK_FIELDS: tuple[str, ...] | None = None\n\n    def __init__(self, *args, **kwargs):\n        skip_validation = kwargs.pop(\"_skip_validation\", False)\n        if not args:\n            if not os.getenv(\"LAMINDB_MULTI_INSTANCE\") == \"true\":\n                if (\n                    issubclass(self.__class__, SQLRecord)\n                    and self.__class__.__name__ != \"Storage\"\n                    # do not save bionty entities in restricted spaces by default\n                    and self.__class__.__module__ != \"bionty.models\"\n                ):\n                    from lamindb import context as run_context\n\n                    if run_context.space is not None:\n                        current_space = run_context.space\n                    elif setup_settings.space is not None:\n                        current_space = setup_settings.space\n\n                    if current_space is not None:\n                        if \"space_id\" in kwargs:\n                            # space_id takes precedence over space\n                            # https://claude.ai/share/f045e5dc-0143-4bc5-b8a4-38309229f75e\n                            if kwargs[\"space_id\"] == 1:  # ignore default space\n                                kwargs.pop(\"space_id\")\n                            
    kwargs[\"space\"] = current_space\n                        elif \"space\" in kwargs:\n                            if kwargs[\"space\"] is None:\n                                kwargs[\"space\"] = current_space\n                        else:\n                            kwargs[\"space\"] = current_space\n                if _is_branch_sensitive_model(self.__class__):\n                    from lamindb import context as run_context\n\n                    if run_context.branch is not None:\n                        current_branch = run_context.branch\n                    elif setup_settings.branch is not None:\n                        current_branch = setup_settings.branch\n\n                    if current_branch is not None:\n                        # branch_id takes precedence over branch\n                        # https://claude.ai/share/f045e5dc-0143-4bc5-b8a4-38309229f75e\n                        if \"branch_id\" in kwargs:\n                            if kwargs[\"branch_id\"] == 1:  # ignore default branch\n                                kwargs.pop(\"branch_id\")\n                                kwargs[\"branch\"] = current_branch\n                        elif \"branch\" in kwargs:\n                            if kwargs[\"branch\"] is None:\n                                kwargs[\"branch\"] = current_branch\n                        else:\n                            kwargs[\"branch\"] = current_branch\n                        kwargs[\"created_on\"] = kwargs[\"branch\"]\n            if skip_validation:\n                super().__init__(**kwargs)\n            else:\n                from ..core._settings import settings\n                from .can_curate import CanCurate\n                from .collection import Collection\n                from .transform import Transform\n\n                validate_fields(self, kwargs)\n\n                # do not search for names if an id is passed; this is important\n                # e.g. 
when synching ids from the notebook store to lamindb\n                has_consciously_provided_uid = False\n                if \"_has_consciously_provided_uid\" in kwargs:\n                    has_consciously_provided_uid = kwargs.pop(\n                        \"_has_consciously_provided_uid\"\n                    )\n                if (\n                    isinstance(self, (CanCurate, Collection, Transform))\n                    and settings.creation.search_names\n                    and not has_consciously_provided_uid\n                ):\n                    name_field = getattr(self, \"_name_field\", \"name\")\n                    exact_match = suggest_records_with_similar_names(\n                        self, name_field, kwargs\n                    )\n                    if exact_match is not None:\n                        if \"version_tag\" in kwargs:\n                            if kwargs.get(\"version_tag\") is not None:\n                                version_comment = \" and version\"\n                                existing_record = self.__class__.filter(\n                                    **{\n                                        name_field: kwargs[name_field],\n                                        \"version_tag\": kwargs.get(\"version_tag\"),\n                                    }\n                                ).one_or_none()\n                            else:\n                                # for a versioned record, an exact name match is not a criterion\n                                # for retrieving a record in case `version` isn't passed -\n                                # we'd always pull out many records with exactly the same name\n                                existing_record = None\n                        else:\n                            version_comment = \"\"\n                            existing_record = exact_match\n                        if existing_record is not None:\n                            logger.important(\n 
                               f\"returning {self.__class__.__name__.lower()} with same\"\n                                f\" {name_field}{version_comment}: '{kwargs[name_field]}'\"\n                            )\n                            init_self_from_db(self, existing_record)\n                            update_attributes(self, kwargs)\n                            # track original values after replacing with the existing record\n                            self._populate_tracked_fields()\n                            return None\n                super().__init__(**kwargs)\n                if isinstance(self, ValidateFields):\n                    # this will trigger validation against django validators\n                    try:\n                        if hasattr(self, \"clean_fields\"):\n                            self.clean_fields()\n                        else:\n                            self._Model__clean_fields()\n                    except DjangoValidationError as e:\n                        message = _format_django_validation_error(self, e)\n                        raise FieldValidationError(message) from e\n        elif len(args) != len(self._meta.concrete_fields):\n            raise FieldValidationError(\n                f\"Use keyword arguments instead of positional arguments, e.g.: {self.__class__.__name__}(name='...').\"\n            )\n        else:\n            super().__init__(*args)\n        # track original values of fields that are tracked for changes\n        self._populate_tracked_fields()\n        # TODO: refactor to use _TRACK_FIELDS\n        track_current_name_value(self)\n\n    # used in __init__\n    # populates the _original_values dictionary with the original values of the tracked fields\n    def _populate_tracked_fields(self):\n        if (track_fields := self._TRACK_FIELDS) is not None:\n            concrete_attnames = {f.attname for f in self._meta.concrete_fields}\n            self._original_values = {}\n            for 
field_name in track_fields:\n                if field_name not in concrete_attnames:\n                    raise FieldValidationError(\n                        f\"_TRACK_FIELDS contains invalid field for {self.__class__.__name__}: {field_name}\"\n                    )\n                # deferred model loading (e.g. .only(\"id\") or certain fetching methods during deletion)\n                # can omit tracked fields from __dict__;\n                # use .get(..., DEFERRED) to avoid KeyError and to show that the field is not loaded yet.\n                self._original_values[field_name] = self.__dict__.get(\n                    field_name, DEFERRED\n                )\n        else:\n            self._original_values = None\n\n    def _field_changed(self, field_name: str, check_is_saved: bool = True) -> bool:\n        \"\"\"Check if the field has changed since the record was saved.\"\"\"\n        # use _id fields for foreign keys in field_name\n        if check_is_saved and self._state.adding:\n            return False\n        # check if the field is tracked for changes\n        track_fields = self._TRACK_FIELDS\n        assert track_fields is not None, (\n            \"_TRACK_FIELDS must be set for the record to track changes\"\n        )\n        assert field_name in track_fields, (\n            f\"Field {field_name} is not tracked for changes\"\n        )\n        # check if the field has changed since the record was created\n        original_value = self._original_values.get(field_name, DEFERRED)\n        if original_value is DEFERRED:\n            return False\n        current_value = self.__dict__.get(field_name, DEFERRED)\n        if current_value is DEFERRED:\n            return False\n        return original_value != current_value\n\n    def save(self: T, *args, **kwargs) -> T:\n        \"\"\"Save.\n\n        Always saves to the default database.\n        \"\"\"\n        using_key = None\n        if \"using\" in kwargs:\n            using_key = 
kwargs[\"using\"]\n        transfer_config = kwargs.pop(\"transfer\", None)\n        db = self._state.db\n        pk_on_db = self.pk\n        artifacts: list = []\n        if self.__class__.__name__ == \"Collection\" and self.id is not None:\n            # when creating a new collection without being able to access artifacts\n            artifacts = self.ordered_artifacts.to_list()\n        pre_existing_record = None\n        # consider records that are being transferred from other databases\n        transfer_logs: dict[str, list[str]] = {\n            \"mapped\": [],\n            \"transferred\": [],\n            \"run\": None,\n        }\n        if db is not None and db != \"default\" and using_key is None:\n            if isinstance(self, IsVersioned):\n                if not self.is_latest:\n                    raise NotImplementedError(\n                        \"You are attempting to transfer a record that's not the latest in its version history. This is currently not supported.\"\n                    )\n            pre_existing_record = transfer_to_default_db(\n                self, using_key, transfer_logs=transfer_logs\n            )\n        self._revises: IsVersioned\n        if pre_existing_record is not None:\n            init_self_from_db(self, pre_existing_record)\n        else:\n            # TODO: refactor to use _TRACK_FIELDS\n            check_name_change(self)\n            try:\n                # save versioned record in presence of self._revises\n                if isinstance(self, IsVersioned) and self._revises is not None:\n                    revises = self._revises\n                    with transaction.atomic():\n                        # For branch-aware models (SQLRecord), keep source-branch latest\n                        # intact and only demote within the same branch. For other\n                        # versioned models (e.g. 
blocks), keep previous behavior.\n                        should_demote = True\n                        if hasattr(revises, \"branch_id\") and hasattr(self, \"branch_id\"):\n                            should_demote = revises.branch_id == self.branch_id\n                        if should_demote:\n                            assert revises.is_latest  # noqa: S101\n                            revises.is_latest = False\n                            revises._revises = None  # ensure we don't start a recursion\n                            revises.save()\n                        super().save(*args, **kwargs)  # type: ignore\n                    self._revises = None\n                # save unversioned record\n                else:\n                    super().save(*args, **kwargs)\n            except (IntegrityError, ProgrammingError) as e:\n                error_msg = str(e)\n                # error for hash/uid duplication\n                if (\n                    self.__class__.__name__ in {\"Transform\", \"Artifact\", \"Collection\"}\n                    and isinstance(e, IntegrityError)\n                    and \"hash\" in error_msg\n                    and unique_constraint_error_in_error_message(error_msg)\n                ):\n                    # we also need to include the key here because hash can be the same across keys\n                    query_fields = {\"hash\": self.hash, \"key\": self.key}\n                    if self.__class__.__name__ == \"Artifact\":\n                        # in case of artifact, also storage is needed\n                        query_fields[\"storage\"] = self.storage\n                    # the get here is Django's get and not aware of the trash or other branches\n                    # but generally we bypass branch_id in queries for hash also in LaminDB's get()\n                    pre_existing_record = self.__class__.get(**query_fields)\n                    from_trash = (\n                        \"from trash\" if 
pre_existing_record.branch_id == -1 else \"\"\n                    )\n                    pre_existing_record.branch_id = 1  # move to default branch\n                    logger.warning(\n                        f\"returning {self.__class__.__name__.lower()} {from_trash} with same hash & key: {pre_existing_record}\"\n                    )\n                    init_self_from_db(self, pre_existing_record)\n                elif (\n                    isinstance(e, IntegrityError)\n                    # for Storage, even if uid was in the error message, we can retrieve based on\n                    # the root because it's going to be the same root\n                    and any(field in error_msg for field in UNIQUE_FIELD_NAMES)\n                    and (\n                        \"_type_name_at_\" not in error_msg\n                    )  # constraints for unique type names in Record, ULabel, etc.\n                    and (\n                        \"UNIQUE constraint failed\" in error_msg\n                        or \"duplicate key value violates unique constraint\" in error_msg\n                    )\n                    and hasattr(self, \"branch_id\")\n                ):\n                    unique_fields = parse_violated_field_from_error_message(error_msg)\n                    # here we query against all branches with .objects\n                    pre_existing_record = self.__class__.objects.get(\n                        **{field: getattr(self, field) for field in unique_fields}\n                    )\n                    # if the existing record is in the default branch, we just return it\n                    if pre_existing_record.branch_id == 1:\n                        logger.warning(\n                            f\"returning {self.__class__.__name__} record with same {unique_fields}: '{ {field: getattr(self, field) for field in unique_fields} }'\"\n                        )\n                    # if the existing record is in a different branch we update its 
fields\n                    else:\n                        # modifies the fields of the existing record with new values of self\n                        field_names = [i.name for i in self.__class__._meta.fields]\n                        update_attributes(\n                            pre_existing_record,\n                            {f: getattr(self, f) for f in field_names},\n                        )\n                        pre_existing_record.save()\n                    init_self_from_db(self, pre_existing_record)\n                elif (\n                    isinstance(e, ProgrammingError)\n                    and \"new row violates row-level security policy\" in error_msg\n                    and (\n                        (is_locked := getattr(self, \"is_locked\", False))\n                        or hasattr(self, \"space\")\n                    )\n                ):\n                    if is_locked:\n                        no_write_msg = \"It is not allowed to modify or create locked ('is_locked=True') records.\"\n                    else:\n                        no_write_msg = (\n                            f\"You're not allowed to write to the space '{self.space.name}'.\\n\"\n                            \"Please contact administrators of the space if you need write access.\"\n                        )\n                    raise NoWriteAccess(no_write_msg) from None\n                elif (\n                    isinstance(e, ProgrammingError)\n                    and \"permission denied for table\" in error_msg\n                    and (isettings := setup_settings.instance)._db_permissions\n                    == \"public\"\n                ):\n                    slug = isettings.slug\n                    raise NoWriteAccess(\n                        f\"You are trying to write to '{slug}' with public (read-only) permissions.\\n\"\n                        \"Please contact administrators to make you a collaborator if you need write access.\\n\"\n           
             f\"If you are already a collaborator, please do 'lamin connect {slug}' in console, \"\n                        \"restart the python session and try again.\"\n                    ) from None\n                else:\n                    raise\n            # call the below in case a user makes more updates to the record\n            track_current_name_value(self)\n        # perform transfer of many-to-many fields\n        # only supported for Artifact and Collection records\n        if db is not None and db != \"default\" and using_key is None:\n            if self.__class__.__name__ == \"Collection\":\n                if len(artifacts) > 0:\n                    logger.info(\"transfer artifacts\")\n                    for artifact in artifacts:\n                        artifact.save()\n                    self.artifacts.add(*artifacts)\n            if hasattr(self, \"labels\") and transfer_config == \"annotations\":\n                from copy import copy\n\n                # here we go back to original record on the source database\n                self_on_db = copy(self)\n                self_on_db._state.db = db\n                self_on_db.pk = pk_on_db  # manually set the primary key\n                self.features._add_from(self_on_db, transfer_logs=transfer_logs)\n                self.labels.add_from(self_on_db, transfer_logs=transfer_logs)\n            for k, v in transfer_logs.items():\n                if k != \"run\" and len(v) > 0:\n                    logger.important(f\"{k}: {', '.join(v)}\")\n\n        if self.__class__.__name__ in {\n            \"Artifact\",\n            \"Transform\",\n            \"Run\",\n            \"ULabel\",\n            \"Feature\",\n            \"Schema\",\n            \"Collection\",\n            \"Reference\",\n        } and not (\n            self.__class__.__name__ == \"Artifact\" and self.kind == \"__lamindb_run__\"\n        ):\n            import lamindb as ln\n\n            if ln.context.project is not None:\n  
              self.projects.add(ln.context.project)\n        return self\n\n    @class_and_instance_method\n    def describe(\n        cls_or_self,\n        return_str: bool = False,\n        include: None | Literal[\"comments\"] = None,\n    ) -> None | str:\n        \"\"\"Describe record including relations.\n\n        Args:\n            return_str: Return a string instead of printing.\n            include: Include additional content. Use ``\"comments\"`` to display\n                readme and comment blocks.\n        \"\"\"\n        from ._describe import describe_postgres_sqlite\n\n        if isinstance(cls_or_self, type):\n            return type(cls_or_self).describe(cls_or_self, return_str=return_str)  # type: ignore\n        else:\n            return describe_postgres_sqlite(\n                cls_or_self, return_str=return_str, include=include\n            )\n\n    def __repr__(\n        self: SQLRecord,\n        include_foreign_keys: bool = True,\n        exclude_field_names: list[str] | None = None,\n    ) -> str:\n        if exclude_field_names is None:\n            exclude_field_names = [\"id\", \"updated_at\", \"source_code\"]\n        field_names = [\n            field.name\n            for field in self._meta.fields\n            if (\n                not isinstance(field, ForeignKey)\n                and field.name not in exclude_field_names\n            )\n        ]\n        if include_foreign_keys:\n            field_names += [\n                f\"{field.name}_id\"\n                for field in self._meta.fields\n                if isinstance(field, ForeignKey)\n            ]\n        # TODO: harmonize with L426 in query_set.py\n        if \"created_at\" in field_names:\n            field_names.remove(\"created_at\")\n            field_names.append(\"created_at\")\n        if \"is_locked\" in field_names:\n            field_names.remove(\"is_locked\")\n            field_names.append(\"is_locked\")\n        if \"created_on\" in field_names:\n        
    field_names.remove(\"created_on\")\n            field_names.append(\"created_on\")\n        if \"version_tag\" in field_names:\n            field_names.remove(\"version_tag\")\n            field_names.append(\"version_tag\")\n        if \"is_latest\" in field_names:\n            field_names.remove(\"is_latest\")\n            field_names.append(\"is_latest\")\n        if field_names[0] != \"uid\" and \"uid\" in field_names:\n            field_names.remove(\"uid\")\n            field_names.insert(0, \"uid\")\n        fields_str = {}\n        for k in field_names:\n            if k == \"n\" and getattr(self, k) < 0:\n                # only needed for Schema\n                continue\n            if (\n                not k.startswith(\"_\")\n                or (k == \"_dtype_str\" and self.__class__.__name__ == \"Feature\")\n            ) and hasattr(self, k):\n                value = getattr(self, k)\n                # Force strip the time component of the version\n                if k == \"version\" and value:\n                    fields_str[k] = f\"'{str(value).split()[0]}'\"\n                else:\n                    fields_str[k] = format_field_value(value)\n        fields_joined_str = \", \".join(\n            [f\"{k}={fields_str[k]}\" for k in fields_str if fields_str[k] is not None]\n        )\n        return f\"{self.__class__.__name__}({fields_joined_str})\"\n\n    def __str__(self) -> str:\n        return self.__repr__()\n\n    def delete(self, permanent: bool | None = None):\n        \"\"\"Delete.\n\n        Args:\n            permanent: For consistency, `False` raises an error, as soft delete is impossible.\n\n        Returns:\n            When `permanent=True`, returns Django's delete return value: a tuple of\n            (deleted_count, {registry_name: count}). 
Otherwise returns None.\n        \"\"\"\n        if permanent is False:\n            raise ValueError(\n                f\"Soft delete is not possible for {self.__class__.__name__}, \"\n                \"use 'permanent=True' or 'permanent=None' for permanent deletion.\"\n            )\n\n        return delete_record(self, is_soft=False)\n\n\nclass Space(BaseSQLRecord):\n    \"\"\"Spaces with managed access for specific users or teams.\n\n    If not setting a space, a :class:`~lamindb.models.SQLRecord` object is accessible to all collaborators of the LaminDB instance because its :attr:`~lamindb.models.SQLRecord.space` field defaults to the built-in `all` space.\n    You can create a restricted space through LaminHub either on the instance settings page or the *Spaces* tab of your account page.\n\n    Examples:\n\n        After creating a restricted space through LaminHub, create an artifact in the space::\n\n            space = ln.Space.get(name=\"Our space\")  # get a space\n            ln.Artifact(\"./test.txt\", key=\"test.txt\", space=space).save()  # save artifact in space\n\n        You can also move an existing object into a space::\n\n            space = ln.Space.get(name=\"Our space\")  # select a space\n            record = ln.Record.get(name=\"existing label\")\n            record.space = space\n            record.save()  # saved in space \"Our space\"\n\n        For more examples and background, see :doc:`docs:permissions`, in particular, section :ref:`docs:use-a-restricted-space`.\n\n    Notes:\n\n        All data in this registry is synchronized from LaminHub so that spaces can be shared and reused across multiple LaminDB instances.\n    \"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n        constraints = [\n            models.UniqueConstraint(Lower(\"name\"), name=\"unique_space_name_lower\")\n        ]\n\n    id: int = models.SmallAutoField(primary_key=True)\n    \"\"\"Internal id, valid only in one DB instance.\"\"\"\n    name: str = 
models.CharField(max_length=100, db_index=True)\n    \"\"\"Name of space.\"\"\"\n    uid: str = CharField(\n        editable=False,\n        unique=True,\n        max_length=12,\n        default=base62_12,\n        db_index=True,\n    )\n    \"\"\"Universal id.\"\"\"\n    description: str | None = TextField(null=True)\n    \"\"\"Description of space.\"\"\"\n    created_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    \"\"\"Time of creation of record.\"\"\"\n    created_by: User = ForeignKey(\n        \"User\", CASCADE, default=None, related_name=\"+\", null=True\n    )\n    \"\"\"Creator of space.\"\"\"\n    ablocks: RelatedManager[SpaceBlock]\n    \"\"\"Attached blocks ← :attr:`~lamindb.SpaceBlock.space`.\"\"\"\n\n    @overload\n    def __init__(\n        self,\n        name: str,\n        description: str | None = None,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(\n        self,\n        *args,\n        **kwargs,\n    ):\n        if not args and \"uid\" not in kwargs:\n            warn = False\n            msg = \"\"\n            isettings = setup_settings.instance\n            if (dialect := isettings.dialect) != \"postgresql\":\n                warn = True\n                msg = f\"on {dialect} databases\"\n            elif not isettings.is_on_hub:\n                warn = True\n                msg = \"on local instances\"\n            if warn:\n                logger.warning(\n                    f\"creating spaces manually {msg} is possible for demo purposes, \"\n                    \"but does *not* affect access permissions\"\n                )\n        super().__init__(*args, **kwargs)\n\n\nclass Branch(BaseSQLRecord):\n    \"\"\"Branches for change management with archive and trash states.\n\n    .. 
dropdown:: The 3 built-in branches: `main`, `trash` & `archive`\n\n        The `main` branch acts as the default branch.\n\n        The `trash` branch acts like a trash bin on a file system.\n        If you delete a `SQLRecord` object via `.delete()`, it gets moved onto the `trash` branch and scheduled for deletion.\n\n        The `archive` branch acts like an archive that hides objects from queries and searches without scheduling them for deletion.\n        To move an object into the archive, run: `obj.branch_id = 0; obj.save()`.\n\n    Args:\n        name: A unique name. When lower-cased, is constrained to be unique across all branches.\n        description: A description.\n\n    Examples:\n\n        To create a contribution branch and switch to it, run::\n\n            lamin switch -c my_branch\n\n        To merge a contribution branch into `main`, run::\n\n            lamin switch main  # switch to the main branch\n            lamin merge my_branch  # merge contribution branch into main\n\n        To see the current branch along with other information, run::\n\n            lamin info\n\n        To annotate the current branch with a `README.md`, run::\n\n            lamin annotate branch --readme README.md\n\n        To comment on the current branch, run::\n\n            lamin annotate branch --comment \"I think we should revisit this, tomorrow, WDYT?\"\n\n        To describe the current branch (optionally include comments), run::\n\n            lamin describe branch --include comments\n\n        To trace on which branch a `SQLRecord` object was created, run::\n\n            sqlrecord.created_on.describe()\n\n        To open a Change Request for a branch, run:\n\n        .. tab-set::\n\n            .. tab-item:: CLI\n\n                .. code-block:: bash\n\n                    lamin update branch --status draft  # for current branch\n                    lamin update branch --name my_branch --status review  # for any branch\n\n            .. 
tab-item:: Python\n\n                .. code-block:: python\n\n                    branch = ln.Branch.get(name=\"my_branch\")\n                    branch.status = \"draft\"\n                    branch.save()\n\n                    branch.status = \"review\"\n                    branch.save()\n\n        Just like Pull Requests on GitHub, branches are never deleted\n        so that the provenance of a change stays traceable.\n\n    .. dropdown:: Managing `is_latest` during branching\n\n        `is_latest` is branch-aware during development and reconciled on merge.\n\n        - Creating a new version on a contribution branch keeps the previous\n          version on `main` as `is_latest=True`.\n        - After `lamin merge`, only one object per version family remains\n          with `is_latest=True` in the target branch.\n        - If both source and target branches have `is_latest=True`, the merged\n          branch keeps the newest object by `created_at`.\n\n        Example flow::\n\n            # before merge\n            # main: v1.is_latest=True\n            # contribution branch: v2(revises=v1).is_latest=True\n            lamin switch main\n            lamin merge my_branch\n            # after merge on main: v2.is_latest=True, v1.is_latest=False\n\n    .. dropdown:: Logical vs. 
physical branching\n\n        LaminDB uses **logical branching** via `SQLRecord`'s `.branch` field, treating `branch` like any other field during queries & tracing,\n        and keeping infrastructure simple and platform-agnostic.\n        However, it doesn't allow isolating SQL `UPDATE` statements on a branch (only their corresponding `DbWrite` events).\n        Here are some notable alternatives:\n\n        - Some Postgres platforms like Supabase or Neon, by contrast, provide physical branching through cloning entire databases.\n          This allows for isolated SQL `UPDATE` statements but creates separate, disconnected environments and much overhead.\n        - Project Nessie is a versioned catalog for data lakes that tracks file states.\n          LaminDB is analogous to Nessie in that it also treats branching on the metadata catalog level\n          (considering LaminDB's SQL database as the metadata catalog).\n        - Dolt is a specialized database engine that provides storage-level branching.\n          It allows branch isolation and merging at the engine level.\n          While powerful, it requires using the Dolt database itself.\n\n        Why logical branching? 
Data science and ML workflows are primarily append-only.\n        Because a \"change\" usually results in a new version of an artifact, transform, or collection or new runs or other new objects rather than an in-place modification,\n        the row-level `branch` field provides isolation for 99% of use cases.\n        This avoids the technical complexity of row duplication, preserves database integrity, and allows the `is_latest` logic to reconcile versions globally upon merge.\n\n    \"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n        constraints = [\n            models.UniqueConstraint(Lower(\"name\"), name=\"unique_branch_name_lower\")\n        ]\n\n    # below isn't fully implemented but a roadmap\n    # - 3: template (hidden in queries & searches)\n    # - 2: locked (same as default, but locked for edits except for space admins)\n    # - 1: default (visible in queries & searches)\n    # - 0: archive (hidden, meant to be kept, locked for edits for everyone)\n    # - -1: trash (hidden, scheduled for deletion)\n\n    # An integer higher than 3 encodes a branch that can be used for collaborators to create drafts\n    # that can be merged onto the main branch in an experience akin to a Pull Request. 
The mapping\n    # onto a semantic branch name is handled through LaminHub.\n\n    id: int = models.AutoField(primary_key=True)\n    \"\"\"An integer id that's synchronized for a family of coupled database instances.\n\n    Among all LaminDB instances, this id is arbitrary and non-unique.\n    \"\"\"\n    name: str = models.CharField(max_length=100, db_index=True)\n    \"\"\"Name of branch.\"\"\"\n    uid: str = CharField(\n        editable=False,\n        unique=True,\n        max_length=12,\n        default=base62_12,\n        db_index=True,\n    )\n    \"\"\"Universal id.\n\n    This id is useful if one wants to apply the same patch to many database instances.\n    \"\"\"\n    space: Space = ForeignKey(Space, PROTECT, default=1, db_default=1, related_name=\"+\")\n    \"\"\"The space associated with the branch.\"\"\"\n    description: str | None = TextField(null=True)\n    \"\"\"Description of branch.\"\"\"\n    created_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    \"\"\"Time of creation of record.\"\"\"\n    created_by: User = ForeignKey(\n        \"User\", PROTECT, default=current_user_id, related_name=\"+\"\n    )\n    \"\"\"Creator of branch.\"\"\"\n    _status_code: int = models.SmallIntegerField(default=0, db_default=0, db_index=True)\n    \"\"\"Status code. -2: closed; -1: merged; 0: standalone; 1: draft; 2: review.\"\"\"\n    _aux: dict[str, Any] | None = JSONField(default=None, db_default=None, null=True)\n    \"\"\"Auxiliary field for dictionary-like metadata.\"\"\"\n    ablocks: RelatedManager[BranchBlock]\n    \"\"\"Attached blocks ← :attr:`~lamindb.BranchBlock.branch`.\"\"\"\n    users: RelatedManager[User] = models.ManyToManyField(\n        \"User\",\n        through=\"BranchUser\",\n        related_name=\"branches\",\n    )\n    \"\"\"Users linked to this branch (e.g. 
reviewers) ← :attr:`~lamindb.User.branches`.\"\"\"\n    ulabels: RelatedManager[ULabel] = models.ManyToManyField(\n        \"ULabel\",\n        through=\"BranchULabel\",\n        related_name=\"branches\",\n    )\n    \"\"\"ULabels annotating this branch ← :attr:`~lamindb.BranchULabel.ulabel`.\"\"\"\n    projects: RelatedManager[Project] = models.ManyToManyField(\n        \"Project\",\n        through=\"BranchProject\",\n        related_name=\"branches\",\n    )\n    \"\"\"Projects annotating this branch ← :attr:`~lamindb.BranchProject.project`.\"\"\"\n\n    @property\n    def status(self) -> BranchStatus:\n        \"\"\"Branch status.\n\n        Get and set the status of the branch.\n\n        =============  =====  ==================================================\n        status         code   description\n        =============  =====  ==================================================\n        `closed`       -2     Change Request was closed without merging.\n        `merged`       -1     The branch was merged into another branch.\n        `standalone`   0      A standalone branch without Change Request.\n        `draft`        1      Change Request exists but is not ready for review.\n        `review`       2      Change Request is ready for review.\n        =============  =====  ==================================================\n\n        The database stores the branch status as an integer code in field `_status_code`.\n\n        Example:\n\n            See the status of a branch::\n\n                branch.status\n                #> 'standalone'\n\n            Open a Change Request in draft state::\n\n                branch.status = \"draft\"\n                branch.save()\n\n            Request review for the Change Request::\n\n                branch.status = \"review\"\n                branch.save()\n\n            Query by status::\n\n                ln.Branch.filter(status=\"merged\").to_dataframe()\n        \"\"\"\n        return 
BRANCH_CODE_TO_STATUS.get(self._status_code, \"standalone\")\n\n    @status.setter\n    def status(self, value: BranchStatus) -> None:\n        if value not in BRANCH_STATUS_TO_CODE:\n            raise ValueError(\n                \"Invalid branch status. Expected one of: \"\n                \"'standalone', 'draft', 'review', 'merged', 'closed'.\"\n            )\n        self._status_code = BRANCH_STATUS_TO_CODE[value]\n\n    @overload\n    def __init__(\n        self,\n        name: str,\n        description: str | None = None,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(\n        self,\n        *args,\n        **kwargs,\n    ):\n        super().__init__(*args, **kwargs)\n\n\nclass BranchUser(BaseSQLRecord, IsLink):\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"branch\", \"user\", \"role\")\n\n    id: int = models.BigAutoField(primary_key=True)\n    branch: Branch = ForeignKey(Branch, CASCADE, related_name=\"links_user\")\n    user: User = ForeignKey(\"User\", PROTECT, related_name=\"links_branch\")\n    role: str = CharField(max_length=32, db_index=True)\n\n\n@doc_args(RECORD_REGISTRY_EXAMPLE)\nclass SQLRecord(BaseSQLRecord, metaclass=Registry):\n    \"\"\"An object that maps to a row in a SQL table in the database.\n\n    For the inherited `SQLRecord` class method definitions, see :class:`~lamindb.models.BaseSQLRecord`.\n\n    Every `SQLRecord` is a data model that comes with a registry in form of a SQL table in your database.\n\n    Sub-classing `SQLRecord` creates a new registry while instantiating a `SQLRecord` creates a new object.\n\n    {}\n\n    `SQLRecord`'s metaclass is :class:`~lamindb.models.Registry`.\n\n    `SQLRecord` inherits from Django's `Model` class.\n    Why does LaminDB call it `SQLRecord` and not `Model`?\n    The term `SQLRecord` can't lead to confusion with statistical, machine learning or biological models.\n    \"\"\"\n\n    # we need 
the db_default when not interacting via django directly on a required field\n    branch: Branch = ForeignKey(\n        Branch,\n        PROTECT,\n        default=1,\n        db_default=1,\n        related_name=\"+\",\n    )\n    \"\"\"The current branch of the object - changes e.g. on merge events.\"\"\"\n    created_on: Branch = ForeignKey(\n        Branch,\n        PROTECT,\n        default=1,\n        db_default=1,\n        related_name=\"+\",\n    )\n    \"\"\"The branch on which this object was created - never changes.\"\"\"\n    space: Space = ForeignKey(Space, PROTECT, default=1, db_default=1, related_name=\"+\")\n    \"\"\"The space.\"\"\"\n    is_locked: bool = BooleanField(default=False, db_default=False)\n    \"\"\"Whether the object is locked for edits.\"\"\"\n    _aux: dict[str, Any] | None = JSONField(default=None, db_default=None, null=True)\n    \"\"\"Auxiliary field for dictionary-like metadata.\"\"\"\n\n    class Meta:\n        abstract = True\n\n    def restore(self) -> None:\n        \"\"\"Restore from trash onto the main branch.\n\n        Does **not** restore descendant objects if the object is `HasType` with `is_type = True`.\n        \"\"\"\n        self.branch_id = 1\n        self.save()\n\n    def delete(self, permanent: bool | None = None, **kwargs):\n        \"\"\"Delete object.\n\n        If object is `HasType` with `is_type = True`, deletes all descendant objects, too.\n\n        Args:\n            permanent: Whether to permanently delete the object (skips trash).\n                If `None`, performs soft delete if the object is not already in the trash.\n\n        Returns:\n            When `permanent=True`, returns Django's delete return value: a tuple of\n            (deleted_count, {registry_name: count}). 
Otherwise returns None.\n\n        Examples:\n\n            For any `SQLRecord` object `sqlrecord`, call::\n\n                sqlrecord.delete()\n        \"\"\"\n        if self._state.adding:\n            logger.warning(\"record is not yet saved, delete has no effect\")\n            return None\n        name_with_module = self.__class__.__get_name_with_module__()\n\n        if name_with_module == \"Artifact\":\n            # this first check means an invalid delete fails fast rather than cascading through\n            # database and storage permission errors\n            isettings = setup_settings.instance\n            if self.storage.instance_uid != isettings.uid and (\n                kwargs[\"storage\"] or kwargs[\"storage\"] is None\n            ):\n                from ..errors import IntegrityError\n                from .storage import Storage\n\n                raise IntegrityError(\n                    \"Cannot simply delete artifacts outside of this instance's managed storage locations.\"\n                    \"\\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`\"\n                    f\"\\n(2) If you want to delete the artifact in storage, please connect to the writing lamindb instance (uid={self.storage.instance_uid}).\"\n                    f\"\\nThese are all managed storage locations of this instance:\\n{Storage.filter(instance_uid=isettings.uid).to_dataframe()}\"\n                )\n\n        # change branch_id to trash\n        trash_branch_id = -1\n        if self.branch_id > trash_branch_id and permanent is not True:\n            if isinstance(self, HasType) and self.is_type:\n                for child in getattr(\n                    self, f\"query_{self.__class__.__name__.lower()}s\"\n                )():\n                    child.delete()\n            delete_record(self, is_soft=True)\n            logger.important(f\"moved record to trash: {self}\")\n            return None\n\n        # permanent 
delete\n        if permanent is None:\n            object_type_name = self.__class__.__name__\n            log_identifier = self.uid if hasattr(self, \"uid\") else self.pk\n            response = input(\n                f\"{object_type_name} {log_identifier} is already in trash! Are you sure you want to delete it from your\"\n                \" database? You can't undo this action. (y/n) \"\n            )\n            confirm_delete = response == \"y\"\n        else:\n            confirm_delete = permanent\n\n        if confirm_delete:\n            if name_with_module == \"Run\":\n                from .run import _permanent_delete_runs\n\n                _permanent_delete_runs(self)\n                return None\n            if name_with_module == \"Transform\":\n                from .transform import _permanent_delete_transforms\n\n                _permanent_delete_transforms(self)\n                return None\n            if name_with_module == \"Artifact\":\n                from .artifact import delete_permanently\n\n                delete_permanently(\n                    self, storage=kwargs[\"storage\"], using_key=kwargs[\"using_key\"]\n                )\n                return None\n            return super().delete()\n        return None\n\n\ndef _format_django_validation_error(record: SQLRecord, e: DjangoValidationError):\n    \"\"\"Pretty print Django validation errors.\"\"\"\n    errors = {}\n    if hasattr(e, \"error_dict\"):\n        error_dict = e.error_dict\n    else:\n        error_dict = {\"__all__\": e.error_list}\n\n    for field_name, error_list in error_dict.items():\n        for error in error_list:\n            if hasattr(error, \"message\"):\n                msg = error.message\n            else:\n                msg = str(error)\n\n            if field_name == \"__all__\":\n                errors[field_name] = f\"{colors.yellow(msg)}\"\n            else:\n                current_value = getattr(record, field_name, None)\n                
errors[field_name] = (\n                    f\"{field_name}: {colors.yellow(current_value)} is not valid\\n    → {msg}\"\n                )\n\n    if errors:\n        message = \"\\n  \"\n        for _, error in errors.items():\n            message += error + \"\\n  \"\n\n        return message\n\n\ndef _get_record_kwargs(record_class) -> list[tuple[str, str]]:\n    \"\"\"Gets the parameters of a SQLRecord from the overloaded signature.\n\n    Example:\n        >>> _get_record_kwargs(bt.Organism)\n        [('name', 'str'), ('taxon_id', 'str | None'), ('scientific_name', 'str | None')]\n    \"\"\"\n    source = inspect.getsource(record_class)\n\n    # Find first overload that's not *db_args\n    pattern = r\"@overload\\s+def __init__\\s*\\(([\\s\\S]*?)\\):\\s*\\.{3}\"\n    overloads = re.finditer(pattern, source)\n\n    for single_overload in overloads:\n        params_block = single_overload.group(1)\n        # This is an additional safety measure if the overloaded signature that we're\n        # looking for is not at the top but a \"db_args\" constructor\n        if \"*db_args\" in params_block:\n            continue\n\n        params = []\n        for line in params_block.split(\"\\n\"):\n            line = line.strip()\n            if not line or \"self\" in line:\n                continue\n\n            # Extract name and type annotation\n            # The regex pattern finds parameter definitions like:\n            # Simple: name: str\n            # With default: age: int = 0\n            # With complex types: items: List[str] = []\n            param_pattern = (\n                r\"(\\w+)\"  # Parameter name\n                r\"\\s*:\\s*\"  # Colon with optional whitespace\n                r\"((?:[^=,]|\"  # Type hint: either non-equals/comma chars\n                r\"(?<=\\[)[^[\\]]*\"  # or contents within square brackets\n                r\"(?=\\]))+)\"  # looking ahead for closing bracket\n                r\"(?:\\s*=\\s*\"  # Optional default value 
part\n                r\"([^,]+))?\"  # Default value: anything but comma\n            )\n            match = re.match(param_pattern, line)\n            if not match:\n                continue\n\n            name, type_str = match.group(1), match.group(2).strip()\n\n            # Keep type as string instead of evaluating\n            params.append((name, type_str))\n\n        return params\n\n    return []\n\n\ndef get_name_field(\n    registry: type[SQLRecord] | QuerySet | Manager,\n    *,\n    field: StrField | None = None,\n) -> str:\n    \"\"\"Get the 1st char or text field from the registry.\"\"\"\n    if isinstance(registry, (QuerySet, Manager)):\n        registry = registry.model\n    model_field_names = [i.name for i in registry._meta.fields]\n\n    # set to default name field\n    if field is None:\n        if hasattr(registry, \"_name_field\"):\n            field = registry._meta.get_field(registry._name_field)\n        elif \"name\" in model_field_names:\n            field = registry._meta.get_field(\"name\")\n        else:\n            # first char or text field that doesn't contain \"id\"\n            for i in registry._meta.fields:\n                if \"id\" in i.name:\n                    continue\n                if i.get_internal_type() in {\"CharField\", \"TextField\"}:\n                    field = i\n                    break\n\n        # no default name field can be found\n        if field is None:\n            raise ValueError(\n                f\"Do not know which field to use as name file for registry {registry}, please pass field\"\n            )\n        else:\n            field = field.name  # type:ignore\n    if not isinstance(field, str):\n        try:\n            field = field.field.name\n        except AttributeError:\n            raise TypeError(\n                \"please pass a SQLRecord string field, e.g., `CellType.name`!\"\n            ) from None\n\n    return field\n\n\ndef add_db_connection(db: str, using: str):\n    db_config 
= dj_database_url.config(\n        default=db, conn_max_age=600, conn_health_checks=True\n    )\n    db_config[\"TIME_ZONE\"] = \"UTC\"\n    db_config[\"OPTIONS\"] = {}\n    db_config[\"AUTOCOMMIT\"] = True\n    connections.settings[using] = db_config\n\n\nREGISTRY_UNIQUE_FIELD = {\"storage\": \"root\", \"ulabel\": \"name\"}\n\n\ndef update_fk_to_default_db(\n    records: SQLRecord | list[SQLRecord] | QuerySet,\n    fk: str,\n    using_key: str | None,\n    transfer_logs: dict,\n):\n    # here in case it is an iterable, we are checking only a single record\n    # and set the same fks for all other records because we do this only\n    # for certain fks where they have to be the same for the whole bulk\n    # see transfer_fk_to_default_db_bulk\n    # todo: but this has to be changed i think, it is not safe as it is now - Sergei\n    record = records[0] if isinstance(records, (list, QuerySet)) else records\n    if getattr(record, f\"{fk}_id\", None) is not None:\n        # set the space of the transferred record to the current space\n        if fk == \"space\":\n            # for space we set the record's space to the current space\n            from lamindb import context\n\n            # the default space has id=1\n            fk_record_default = Space.get(1) if context.space is None else context.space\n        # process non-space fks\n        else:\n            fk_record = getattr(record, fk)\n            field = REGISTRY_UNIQUE_FIELD.get(fk, \"uid\")\n            fk_record_default = fk_record.__class__.filter(\n                **{field: getattr(fk_record, field)}\n            ).one_or_none()\n            if fk_record_default is None:\n                from copy import copy\n\n                fk_record_default = copy(fk_record)\n                transfer_to_default_db(\n                    fk_record_default, using_key, save=True, transfer_logs=transfer_logs\n                )\n        # re-set the fks to the newly saved ones in the default db\n        if 
isinstance(records, (list, QuerySet)):\n            for r in records:\n                setattr(r, f\"{fk}\", None)\n                setattr(r, f\"{fk}_id\", fk_record_default.id)\n        else:\n            setattr(records, f\"{fk}\", None)\n            setattr(records, f\"{fk}_id\", fk_record_default.id)\n\n\nFKBULK = [\n    \"organism\",\n    \"source\",\n    \"report\",  # Run\n]\n\n\ndef transfer_fk_to_default_db_bulk(\n    records: list | QuerySet, using_key: str | None, transfer_logs: dict\n):\n    for fk in FKBULK:\n        update_fk_to_default_db(records, fk, using_key, transfer_logs=transfer_logs)\n\n\ndef get_transfer_run(record) -> Run:\n    from lamindb import settings\n    from lamindb.core._context import context\n    from lamindb.models import Run, Transform\n    from lamindb.models.artifact import WARNING_RUN_TRANSFORM\n\n    slug = record._state.db\n    owner, name = get_owner_name_from_identifier(slug)\n    cache_using_filepath = (\n        ln_setup.settings.cache_dir / f\"instance--{owner}--{name}--uid.txt\"\n    )\n    if not cache_using_filepath.exists():\n        raise SystemExit(\"Need to call .connect() before\")\n    instance_uid = cache_using_filepath.read_text().split(\"\\n\")[0]\n    # TODO: consider renaming to __lamindb_sync__\n    key = f\"__lamindb_transfer__/{instance_uid}\"\n    uid = instance_uid + \"0000\"\n    transform = Transform.filter(uid=uid).one_or_none()\n    if transform is None:\n        search_names = settings.creation.search_names\n        settings.creation.search_names = False\n        # TODO: consider renaming to \"Sync from\"\n        transform = Transform(  # type: ignore\n            uid=uid, description=f\"Transfer from `{slug}`\", key=key, kind=\"function\"\n        ).save()\n        settings.creation.search_names = search_names\n    # use the global run context to get the initiated_by_run run id\n    if context.run is not None:\n        initiated_by_run = context.run\n    else:\n        if not 
settings.creation.artifact_silence_missing_run_warning:\n            logger.warning(WARNING_RUN_TRANSFORM)\n        initiated_by_run = None\n    # it doesn't seem to make sense to create new runs for every transfer\n    run = Run.filter(transform=transform, initiated_by_run=initiated_by_run).first()\n    if run is None:\n        run = Run(transform=transform, initiated_by_run=initiated_by_run).save()  # type: ignore\n        run.initiated_by_run = initiated_by_run  # so that it's available in memory\n    return run\n\n\ndef transfer_to_default_db(\n    record: SQLRecord,\n    using_key: str | None,\n    *,\n    transfer_logs: dict,\n    save: bool = False,\n    transfer_fk: bool = True,\n) -> SQLRecord | None:\n    if record._state.db is None or record._state.db == \"default\":\n        return None\n    registry = record.__class__\n    logger.debug(f\"transferring {registry.__name__} record {record.uid} to default db\")\n    record_on_default = registry.objects.filter(uid=record.uid).one_or_none()\n    record_str = f\"{record.__class__.__name__}(uid='{record.uid}')\"\n    if transfer_logs[\"run\"] is None:\n        transfer_logs[\"run\"] = get_transfer_run(record)\n    if record_on_default is not None:\n        transfer_logs[\"mapped\"].append(record_str)\n        return record_on_default\n    else:\n        transfer_logs[\"transferred\"].append(record_str)\n\n    if hasattr(record, \"created_by_id\"):\n        record.created_by = None\n        record.created_by_id = ln_setup.settings.user.id\n    # run & transform\n    run = transfer_logs[\"run\"]\n    if hasattr(record, \"run_id\"):\n        record.run = None\n        record.run_id = run.id\n    # deal with denormalized transform FK on artifact and collection\n    if hasattr(record, \"transform_id\"):\n        record.transform = None\n        record.transform_id = run.transform_id\n    # transfer other foreign key fields\n    fk_fields = [\n        i.name\n        for i in record._meta.fields\n        if 
i.get_internal_type() == \"ForeignKey\"\n        if i.name not in {\"created_by\", \"run\", \"transform\", \"branch\"}\n    ]\n    if not transfer_fk:\n        # don't transfer fk fields that are already bulk transferred\n        fk_fields = [fk for fk in fk_fields if fk not in FKBULK]\n    for fk in fk_fields:\n        update_fk_to_default_db(record, fk, using_key, transfer_logs=transfer_logs)\n    record.id = None\n    record._state.db = \"default\"\n    if save:\n        record.save()\n    return None\n\n\ndef track_current_name_value(record: SQLRecord):\n    # below, we're using __dict__ to avoid triggering the refresh from the database\n    # which can lead to a recursion\n    if hasattr(record, \"_name_field\"):\n        record._old_name = record.__dict__.get(record._name_field)\n\n\ndef check_name_change(record: SQLRecord):\n    \"\"\"Warns if a record's name has changed.\"\"\"\n    from lamindb.models import (\n        Artifact,\n        Collection,\n        Feature,\n        Schema,\n        Storage,\n        Transform,\n    )\n\n    if (\n        not record.pk\n        or not hasattr(record, \"_old_name\")\n        or not hasattr(record, \"_name_field\")\n    ):\n        return\n\n    # key-like records are not checked here\n    if isinstance(record, (Artifact, Collection, Transform)):\n        return\n\n    # renaming feature sets is not checked\n    if isinstance(record, Schema):\n        return\n\n    old_name = record._old_name\n    new_name = getattr(record, record._name_field)\n    registry = record.__class__.__name__\n\n    if old_name != new_name:\n        if hasattr(record, \"artifacts\") and not isinstance(record, Storage):\n            linked_records = (\n                # find all artifacts that are linked to this label via a feature with dtype\n                # matching on the name aka \"[registry]\"\n                record.artifacts.through.filter(\n                    feature___dtype_str__contains=f\"[{registry}]\",\n                    
**{f\"{registry.lower()}_id\": record.pk},\n                )\n            )\n            artifact_uids = list(set(linked_records.to_list(\"artifact__uid\")))\n            n = len(artifact_uids)\n            if n > 0:\n                s = \"s\" if n > 1 else \"\"\n                es = \"es\" if n == 1 else \"\"\n                logger.error(\n                    f\"by {colors.red('renaming label')} from '{old_name}' to '{new_name}' \"\n                    f\"{n} artifact{s} no longer match{es} the label name in storage: {artifact_uids}\\n\\n\"\n                    f\"   → consider re-curating\\n\"\n                )\n        elif isinstance(record, Feature):\n            # only internal features of schemas with `itype=Feature` are prone to getting out of sync\n            artifact_uids = Artifact.filter(\n                schemas__features=record, schemas__itype=\"Feature\"\n            ).to_list(\"uid\")\n            n = len(artifact_uids)\n            if n > 0:\n                s = \"s\" if n > 1 else \"\"\n                es = \"es\" if n == 1 else \"\"\n                logger.warning(\n                    f\"by {colors.red('renaming feature')} from '{old_name}' to '{new_name}' \"\n                    f\"{n} artifact{s} no longer match{es} the feature name in storage: {artifact_uids}\\n\"\n                    \"  → consider re-curating\"\n                )\n\n\ndef format_field_value(value: datetime | str | Any, none: str = \"None\") -> str:\n    from datetime import datetime\n\n    if isinstance(value, datetime):\n        return value.strftime(\"%Y-%m-%d %H:%M:%S %Z\")\n    if isinstance(value, str):\n        try:\n            value = datetime.fromisoformat(value)\n            value = value.strftime(\"%Y-%m-%d %H:%M:%S %Z\")\n        except ValueError:\n            pass\n        return f\"'{value}'\"\n    if value is None:\n        return none\n    return str(value)\n\n\nclass SQLRecordInfo:\n    def __init__(self, registry: Registry):\n        self.registry = 
registry\n\n    def _get_type_for_field(self, field_name: str) -> str:\n        field = self.registry._meta.get_field(field_name)\n        related_model_name = (\n            field.related_model.__name__\n            if hasattr(field, \"related_model\") and field.related_model\n            else None\n        )\n        return related_model_name if related_model_name else field.get_internal_type()\n\n    def _get_base_class_fields(self) -> list[str]:\n        return [\n            field.name\n            for base in self.registry.__bases__\n            if hasattr(base, \"_meta\")\n            for field in base._meta.get_fields()\n        ]\n\n    def _reorder_fields_by_class(self, fields_to_order: list[Field]) -> list[Field]:\n        \"\"\"Reorders the fields so that base class fields come last.\"\"\"\n        non_base_class_fields = [\n            field\n            for field in fields_to_order\n            if field.name not in self._get_base_class_fields()\n        ]\n        found_base_class_fields = [\n            field\n            for field in fields_to_order\n            if field.name in self._get_base_class_fields()\n        ]\n        return non_base_class_fields + found_base_class_fields\n\n    def get_simple_fields(self, return_str: bool = False) -> Any:\n        simple_fields = [\n            field\n            for field in self.registry._meta.get_fields()\n            if not (\n                isinstance(field, ManyToOneRel)\n                or isinstance(field, ManyToManyRel)\n                or isinstance(field, ManyToManyField)\n                or isinstance(field, ForeignKey)\n                or field.name.startswith(\"_\")\n                or field.name == \"id\"\n            )\n        ]\n        simple_fields = self._reorder_fields_by_class(simple_fields)\n        if not return_str:\n            return simple_fields\n        else:\n            repr_str = f\"  {colors.italic('Simple fields')}\\n\"\n            if simple_fields:\n                
repr_str += \"\".join(\n                    [\n                        f\"    .{field_name.name}: {self._get_type_for_field(field_name.name)}\\n\"\n                        for field_name in simple_fields\n                    ]\n                )\n            return repr_str\n\n    def get_relational_fields(self, return_str: bool = False):\n        # we ignore ManyToOneRel because it leads to so much clutter in the API\n        # also note that our general guideline is to have related_name=\"+\"\n        # for ForeignKey fields\n        relational_fields = (ManyToOneRel, ManyToManyRel, ManyToManyField, ForeignKey)\n\n        class_specific_relational_fields = [\n            field\n            for field in self.registry._meta.fields + self.registry._meta.many_to_many\n            if isinstance(field, relational_fields)\n            and not field.name.startswith((\"links_\", \"_\"))\n        ]\n\n        non_class_specific_relational_fields = [\n            field\n            for field in self.registry._meta.get_fields()\n            if isinstance(field, relational_fields)\n            and not field.name.startswith((\"links_\", \"_\"))\n        ]\n        non_class_specific_relational_fields = self._reorder_fields_by_class(\n            non_class_specific_relational_fields\n        )\n\n        # Ensure that class specific fields (e.g. Artifact) come before non-class specific fields (e.g. 
collection)\n        filtered_non_class_specific = [\n            field\n            for field in non_class_specific_relational_fields\n            if field not in class_specific_relational_fields\n        ]\n        ordered_relational_fields = (\n            class_specific_relational_fields + filtered_non_class_specific\n        )\n\n        # For Record class, move linked_in fields to the end\n        if self.registry.__name__ == \"Record\":\n            regular_fields = [\n                f\n                for f in ordered_relational_fields\n                if not f.name.startswith((\"linked_\", \"values_\"))\n            ]\n            linked_fields = [\n                f for f in ordered_relational_fields if f.name.startswith(\"linked_\")\n            ]\n            values_fields = [\n                f for f in ordered_relational_fields if f.name.startswith(\"values_\")\n            ]\n            ordered_relational_fields = regular_fields + linked_fields + values_fields\n\n        core_module_fields = []\n        external_modules_fields = []\n        for field in ordered_relational_fields:\n            field_name = repr(field).split(\": \")[1][:-1]\n            if field_name.count(\".\") == 1 and \"lamindb\" not in field_name:\n                external_modules_fields.append(field)\n            else:\n                core_module_fields.append(field)\n\n        def _get_related_field_type(field) -> str:\n            model_name = field.related_model.__get_name_with_module__()\n            # Extract the class name (after the last dot if there's a module prefix)\n            class_name = model_name.split(\".\")[-1]\n            # Skip replacement for compound names like ArtifactBlock, FeatureBlock, etc.\n            if class_name.endswith(\"Block\"):\n                # Return just the class name for Block types\n                field_type = class_name\n            else:\n                field_type = (\n                    model_name.replace(\n                     
   \"Artifact\", \"\"\n                    ).replace(  # some fields have an unnecessary 'Artifact' in their name\n                        \"Collection\", \"\"\n                    )  # some fields have an unnecessary 'Collection' in their name\n                )\n            return (\n                self._get_type_for_field(field.name)\n                if not field_type.strip()\n                else field_type\n            )\n\n        core_module_fields_formatted = [\n            f\"    .{field.name}: {_get_related_field_type(field)}\\n\"\n            for field in core_module_fields\n        ]\n        external_modules_fields_formatted = [\n            f\"    .{field.name}: {_get_related_field_type(field)}\\n\"\n            for field in external_modules_fields\n        ]\n\n        if not return_str:\n            external_modules_fields_by_modules = defaultdict(list)\n            for field_str, field in zip(\n                external_modules_fields_formatted, external_modules_fields\n            ):\n                field_type = field_str.split(\":\")[1].split()[0]\n                module_name = field_type.split(\".\")[0]\n                external_modules_fields_by_modules[module_name].append(field)\n            return core_module_fields, external_modules_fields_by_modules\n        else:\n            repr_str = \"\"\n\n            # Non-external relational fields\n            if core_module_fields:\n                repr_str += f\"  {colors.italic('Relational fields')}\\n\"\n                repr_str += \"\".join(core_module_fields_formatted)\n\n            # External relational fields\n            external_modules = set()\n            for field in external_modules_fields_formatted:\n                field_type = field.split(\":\")[1].split()[0]\n                external_modules.add(field_type.split(\".\")[0])\n\n            if external_modules:\n                # We want Bionty to show up before other modules\n                external_modules = (\n                  
  [\"bionty\"] + sorted(external_modules - {\"bionty\"})  # type: ignore\n                    if \"bionty\" in external_modules\n                    else sorted(external_modules)\n                )\n                for ext_module in external_modules:\n                    ext_module_fields = [\n                        field\n                        for field in external_modules_fields_formatted\n                        if ext_module in field\n                    ]\n\n                    if ext_module_fields:\n                        repr_str += (\n                            f\"  {colors.italic(f'{ext_module.capitalize()} fields')}\\n\"\n                        )\n                        repr_str += \"\".join(ext_module_fields)\n\n            return repr_str\n\n\nclass Migration(BaseSQLRecord):\n    app = CharField(max_length=255)\n    name = CharField(max_length=255)\n    applied: datetime = DateTimeField()\n\n    class Meta:\n        db_table = \"django_migrations\"\n        app_label = \"lamindb\"\n        managed = False\n\n\nLinkORM = IsLink  # backward compat\nRecord = SQLRecord  # backward compat\nBasicRecord = BaseSQLRecord  # backward compat\nRecordInfo = SQLRecordInfo  # backward compat\n"
  },
  {
    "path": "lamindb/models/storage.py",
    "content": "from __future__ import annotations\n\nfrom typing import (\n    TYPE_CHECKING,\n    overload,\n)\nfrom uuid import UUID\n\nfrom django.db import models\nfrom lamin_utils import logger\nfrom lamindb_setup import settings as setup_settings\nfrom lamindb_setup.core._hub_core import (\n    delete_storage_record,\n    get_storage_records_for_instance,\n    select_space,\n    update_storage_with_space,\n)\nfrom lamindb_setup.core._settings_storage import (\n    StorageSettings,\n    get_storage_type,\n    init_storage,\n)\nfrom lamindb_setup.core.upath import check_storage_is_empty, create_path\n\nfrom lamindb.base.fields import (\n    CharField,\n    TextField,\n)\n\nfrom ..base.uids import base62_12\nfrom .run import TracksRun, TracksUpdates\nfrom .sqlrecord import Space, SQLRecord\n\nif TYPE_CHECKING:\n    from lamindb_setup.types import StorageType\n    from upath import UPath\n\n    from .artifact import Artifact\n\n\nclass Storage(SQLRecord, TracksRun, TracksUpdates):\n    \"\"\"Storage locations of artifacts such as local directories or S3 buckets.\n\n    A storage location is either a directory (local or a folder in the cloud) or\n    an entire S3/GCP bucket.\n\n    A storage location is written to by at most one LaminDB instance: the location’s *managing instance*.\n    Some locations are not managed with LaminDB and, hence, do not have a managing instance.\n\n    .. dropdown:: Writable vs. read-only storage locations\n\n        The `instance_uid` field of `Storage` defines its *managing instance*.\n        Only if a storage location's `instance_uid` matches your current instance's `uid` (`ln.settings.instance_uid`),\n        you can write to it.\n        All other storage locations are read-only in your current instance.\n\n        Here is an example (`source <https://lamin.ai/laminlabs/lamindata/transform/dPco79GYgzag0000>`__).\n\n        .. 
image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/eHDmIOAxLEoqZ2oK0000.png\n           :width: 400px\n\n        Some storage locations are not managed by any LaminDB instance, hence, their `instance_uid` is `None`.\n\n    .. dropdown:: Managing access to storage locations across instances\n\n        You can manage access through LaminHub's fine-grained access management or\n        through AWS policies that you attach to your S3 bucket.\n\n        To enable access management via LaminHub, head over to `https://lamin.ai/{account}/infrastructure`.\n        By clicking the green button that says \"Connect S3 bucket\", your collaborators will access data\n        based on their LaminHub permissions.\n        :doc:`docs:permissions` has more details.\n\n        .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/ze8hkgVxVptSSZEU0000.png\n           :width: 800px\n\n        By default, a storage location inherits the access permissions of its instance. If you\n        want to further restrict access to a storage location, you can move it into a space::\n\n            space = ln.Space.get(name=\"my-space\")\n            storage_loc = ln.Storage.get(root=\"s3://my-storage-location\")\n            storage_loc.space = space\n            storage_loc.save()\n\n        If you don't want to store data in the cloud, you can use local storage locations: :doc:`faq/keep-artifacts-local`.\n\n    Args:\n        root: `str` The root path of the storage location, e.g., `\"./mydir\"`, `\"s3://my-bucket\"`, `\"s3://my-bucket/myfolder\"`, `\"gs://my-bucket/myfolder\"`, `\"/nfs/shared/datasets/genomics\"`, `\"/weka/shared/models/\"`, ...\n        description: `str | None = None` An optional description.\n        space: `Space | None = None` A space to restrict access permissions to the storage location.\n        host: `str | None = None` For local storage locations, a globally unique identifier for the physical machine/server hosting the storage.\n            This 
distinguishes storage locations that may have the same local path but exist on different servers, e.g. `\"my-institute-cluster-1\"`, `\"my-server-abcd\"`.\n\n    See Also:\n        :attr:`lamindb.core.Settings.storage`\n            Current default storage location of your compute session for writing artifacts.\n        :attr:`~lamindb.setup.core.StorageSettings`\n            Storage settings.\n        :doc:`faq/keep-artifacts-local`\n            Avoid storing artifacts in the cloud, but keep them on local infrastructure.\n\n    Examples:\n\n        When you create a LaminDB instance, you configure its default storage location via `--storage`::\n\n            lamin init --storage ./mydatadir  # or \"s3://my-bucket/myfolder\", \"gs://my-bucket/myfolder\", ...\n\n        View the current default storage location for writing artifacts::\n\n            import lamindb as ln\n\n            ln.settings.storage\n\n        Create a new cloud storage location::\n\n            ln.Storage(root=\"s3://our-bucket/our-folder\").save()\n\n        Create a new local storage location::\n\n            ln.Storage(root=\"/dir/our-shared-dir\", host=\"our-server-123\").save()\n\n        Globally switch to another storage location::\n\n            ln.settings.storage = \"/dir/our-shared-dir\"  # or \"s3://our-bucket/our-folder\", \"gs://our-bucket/our-folder\", ...\n\n        Or if you're operating in `keep-artifacts-local` mode (:doc:`faq/keep-artifacts-local`)::\n\n            ln.settings.local_storage = \"/dir/our-other-shared-dir\"\n\n        View all storage locations used in your LaminDB instance::\n\n            ln.Storage.to_dataframe()\n\n    Notes:\n\n        .. 
dropdown:: What is the `.lamindb/` directory inside a storage location?\n\n            It stores all artifacts that are ingested through `lamindb`, indexed by the artifact `uid`.\n            This means you don't have to worry about renaming or moving files, as this all happens on the database level.\n\n            Existing artifacts are typically stored in hierarchical structures with semantic folder names.\n            Instead of copying such artifacts into `.lamindb/` upon calls of `Artifact(\"legacy_path\").save()`,\n            LaminDB registers them with the semantic `key` representing the relative path within the storage location.\n            These artifacts are marked with `artifact._key_is_virtual = False` and treated correspondingly.\n\n            There is only a single `.lamindb/` directory per storage location.\n\n        .. dropdown:: What should I do if I want to bulk migrate all artifacts to another storage?\n\n            Currently, you can only achieve this manually and you should be careful with it.\n\n            1. Copy or move artifacts into the desired new storage location\n            2. Adapt the corresponding record in the :class:`~lamindb.Storage` registry by setting the `root` field to the new location\n            3. 
If your LaminDB storage location is connected to the hub, you also need to update the storage record on the hub\n\n    \"\"\"\n\n    class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta):\n        abstract = False\n        app_label = \"lamindb\"\n\n    _name_field: str = \"root\"\n\n    id: int = models.AutoField(primary_key=True)\n    \"\"\"Internal id, valid only in one DB instance.\"\"\"\n    uid: str = CharField(\n        editable=False, unique=True, max_length=12, default=base62_12, db_index=True\n    )\n    \"\"\"Universal id, valid across DB instances.\"\"\"\n    root: str = CharField(db_index=True, unique=True)\n    \"\"\"Root path of storage (cloud or local path).\"\"\"\n    description: str | None = TextField(null=True)\n    \"\"\"A description.\"\"\"\n    type: StorageType = CharField(max_length=30, db_index=True)\n    \"\"\"Can be \"local\" vs. \"s3\" vs. \"gs\". Is auto-detected from the format of the `root` path.\"\"\"\n    region: str | None = CharField(max_length=64, db_index=True, null=True)\n    \"\"\"Storage region for cloud storage locations. 
Host identifier for local storage locations.\"\"\"\n    instance_uid: str | None = CharField(max_length=12, db_index=True, null=True)\n    \"\"\"The writing instance.\n\n    Only the LaminDB instance with this `uid` can write to this storage location.\n    This instance also governs the access permissions of the storage location unless the location is moved into a space.\n    \"\"\"\n    artifacts: Artifact\n    \"\"\"Artifacts contained in this storage location.\"\"\"\n\n    @overload\n    def __init__(\n        self,\n        root: str,\n        *,\n        description: str | None = None,\n        space: Space | None = None,\n        host: str | None = None,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(\n        self,\n        *args,\n        **kwargs,\n    ):\n        if len(args) == len(self._meta.concrete_fields):\n            super().__init__(*args)\n            self._old_space_id = self.space_id\n            return None\n        if args:\n            assert len(args) == 1, (  # noqa: S101\n                \"Storage can only be initialized with a single positional argument, the root path.\"\n            )\n            kwargs[\"root\"] = args[0]\n        if \"host\" in kwargs:\n            if \"type\" in kwargs:\n                assert kwargs[\"type\"] == \"local\", (  # noqa: S101\n                    \"type needs to be 'local' if host is set\"\n                )\n            else:\n                kwargs[\"type\"] = \"local\"\n            assert get_storage_type(kwargs[\"root\"]) == \"local\", (  # noqa: S101\n                \"root must be a local path if host is set\"\n            )\n            assert \"region\" not in kwargs, \"region must not be set if host is set\"  # noqa: S101\n            kwargs[\"region\"] = kwargs.pop(\"host\")\n            storage_record = Storage.filter(\n                root=kwargs[\"root\"], region=kwargs[\"region\"]\n            ).one_or_none()\n        
else:\n            storage_record = Storage.filter(root=kwargs[\"root\"]).one_or_none()\n        space = kwargs.get(\"space\", None)\n        if storage_record is not None:\n            from .sqlrecord import init_self_from_db\n\n            init_self_from_db(self, storage_record)\n            self._old_space_id = self.space_id\n            return None\n\n        skip_mark_storage_root = kwargs.pop(\"skip_mark_storage_root\", False)\n\n        skip_preparation = kwargs.pop(\"_skip_preparation\", False)\n        if skip_preparation:\n            assert space is None, \"`space` must not be set if _skip_preparation is True\"  # noqa: S101\n            super().__init__(*args, **kwargs)\n            return None\n\n        space_uuid = None\n        if space is not None:\n            hub_space_record = select_space(space.uid)\n            if hub_space_record is None:\n                raise ValueError(\n                    \"Please first create a space on the hub: https://docs.lamin.ai/access\"\n                )\n            space_uuid = UUID(hub_space_record[\"id\"])\n\n        # instance_id won't take effect if\n        # - there is no write access\n        # - the storage location is already managed by another instance\n        ssettings, _ = init_storage(\n            kwargs[\"root\"],\n            instance_id=setup_settings.instance._id,\n            instance_slug=setup_settings.instance.slug,\n            register_hub=setup_settings.instance.is_on_hub,\n            region=kwargs.get(\"region\", None),  # host was renamed to region already\n            space_uuid=space_uuid,\n            skip_mark_storage_root=skip_mark_storage_root,\n        )\n        # ssettings performed validation and normalization of the root path\n        kwargs[\"root\"] = ssettings.root_as_str  # noqa: S101\n        if \"instance_uid\" in kwargs:\n            assert kwargs[\"instance_uid\"] == ssettings.instance_uid  # noqa: S101\n        else:\n            kwargs[\"instance_uid\"] = 
ssettings.instance_uid\n        if ssettings._uid is not None:  # need private attribute here\n            kwargs[\"uid\"] = ssettings._uid\n        if \"type\" not in kwargs:\n            kwargs[\"type\"] = ssettings.type\n        else:\n            assert kwargs[\"type\"] == ssettings.type  # noqa: S101\n        if \"region\" in kwargs:\n            assert kwargs[\"region\"] == ssettings.region  # noqa: S101\n        else:\n            kwargs[\"region\"] = ssettings.region\n\n        is_managed_by_current_instance = (\n            ssettings.instance_uid == setup_settings.instance.uid\n        )\n        if ssettings.instance_uid is not None and not is_managed_by_current_instance:\n            is_managed_by_instance = (\n                f\", is managed by instance with uid {ssettings.instance_uid}\"\n            )\n        else:\n            is_managed_by_instance = \"\"\n        hub_message = \"\"\n        if setup_settings.instance.is_on_hub and is_managed_by_current_instance:\n            instance_owner = setup_settings.instance.owner\n            ui_url = setup_settings.instance.ui_url\n            hub_message = f\", see: {ui_url}/{instance_owner}/infrastructure\"\n        managed_message = (\n            \"created managed\"\n            if is_managed_by_current_instance\n            else \"referenced read-only\"\n        )\n        logger.important(\n            f\"{managed_message} storage location at {kwargs['root']}{is_managed_by_instance}{hub_message}\"\n        )\n        super().__init__(**kwargs)\n        self._old_space_id = self.space_id\n\n    @property\n    def host(self) -> str | None:\n        \"\"\"Host identifier for local storage locations.\n\n        Is `None` for locations with `type != \"local\"`.\n\n        A globally unique user-defined host identifier (cluster, server, laptop, etc.).\n        \"\"\"\n        if self.type != \"local\":\n            return None\n        return self.region\n\n    @property\n    def path(self) -> UPath:\n    
    \"\"\"Path.\n\n        Uses the `.root` field and converts it into a `Path` or `UPath`.\n        \"\"\"\n        access_token = self._access_token if hasattr(self, \"_access_token\") else None\n        return create_path(self.root, access_token=access_token)\n\n    def save(self, *args, **kwargs):\n        \"\"\"Save the storage record.\"\"\"\n        if hasattr(self, \"_old_space_id\") and self._old_space_id != self.space_id:\n            update_storage_with_space(storage_lnid=self.uid, space_lnid=self.space.uid)\n        super().save(*args, **kwargs)\n        return self\n\n    def delete(self, permanent: bool | None = None) -> None:  # type: ignore\n        # type ignore is there because we don't use a trash here unlike everywhere else\n        \"\"\"Delete the storage location.\n\n        This errors in case the storage location is not empty.\n\n        Unlike other `SQLRecord`-based registries, this does *not* move the storage record into the trash.\n\n        Args:\n            permanent: `False` raises an error, as soft delete is impossible.\n        \"\"\"\n        from .. 
import settings\n\n        if permanent is False:\n            raise ValueError(\n                \"Soft delete is not possible for Storage, \"\n                \"use 'permanent=True' or 'permanent=None' for permanent deletion.\"\n            )\n        assert not self.artifacts.exists(), (\n            \"Cannot delete storage with artifacts in current instance.\"\n        )  # noqa: S101\n        # the simple case of a read-only storage location\n        if self.instance_uid != setup_settings.instance.uid:\n            super(SQLRecord, self).delete()\n            return None\n        # now the complicated case of a written/managed storage location\n        check_storage_is_empty(self.path)\n        assert settings.storage.root_as_str != self.root, (  # noqa: S101\n            \"Cannot delete the current storage location, switch to another.\"\n        )\n        if setup_settings.user.handle != \"anonymous\":  # only attempt if authenticated\n            storage_records = get_storage_records_for_instance(\n                # only query those storage records on the hub that are managed by the current instance\n                setup_settings.instance._id\n            )\n            for storage_record in storage_records:\n                if storage_record[\"lnid\"] == self.uid:\n                    assert storage_record[\"is_default\"] in {False, None}, (  # noqa: S101\n                        \"Cannot delete default storage of instance.\"\n                    )\n                    delete_storage_record(storage_record)\n        ssettings = StorageSettings(self.root)\n        if ssettings._mark_storage_root.exists():\n            ssettings._mark_storage_root.unlink(\n                missing_ok=True  # this is totally weird, but needed on Py3.11\n            )\n        super(SQLRecord, self).delete()\n"
  },
  {
    "path": "lamindb/models/transform.py",
    "content": "from __future__ import annotations\n\nimport warnings\nfrom typing import TYPE_CHECKING, overload\n\nfrom django.db import models\nfrom django.db.models import CASCADE, PROTECT, Q\nfrom lamin_utils import logger\nfrom lamindb_setup.core.hashing import HASH_LENGTH, hash_file, hash_string\n\nfrom lamindb.base import deprecated\nfrom lamindb.base.fields import (\n    CharField,\n    DateTimeField,\n    ForeignKey,\n    TextField,\n)\nfrom lamindb.base.users import current_user_id\n\nfrom .._secret_redaction import redact_secrets_in_source_code\nfrom ..models._is_versioned import process_revises\nfrom ._is_versioned import IsVersioned, _adjust_is_latest_when_deleting_is_versioned\nfrom .run import Run, User\nfrom .sqlrecord import (\n    BaseSQLRecord,\n    IsLink,\n    SQLRecord,\n    init_self_from_db,\n    update_attributes,\n)\n\nif TYPE_CHECKING:\n    from datetime import datetime\n    from pathlib import Path\n\n    from lamindb.base.types import TransformKind\n\n    from .artifact import Artifact\n    from .block import TransformBlock\n    from .project import Project, Reference\n    from .query_manager import RelatedManager\n    from .query_set import QuerySet\n    from .record import Record\n    from .ulabel import ULabel\n\n\n# does not inherit from TracksRun because the Transform\n# is needed to define a run\nclass Transform(SQLRecord, IsVersioned):\n    \"\"\"Data transformations such as scripts, notebooks, functions, or pipelines.\n\n    A `transform` can be a function, a script, a notebook, or a\n    pipeline. If you execute a transform, you generate a run\n    (:class:`~lamindb.Run`). A run has inputs and outputs.\n\n    Pipelines are typically created with a workflow manager (Nextflow, Snakemake,\n    Prefect, Flyte, Dagster, redun, Airflow, ...).\n\n    Transforms are versioned so that a given transform version maps on a given\n    source code version.\n\n    .. 
dropdown:: Can I sync transforms to git?\n\n        If you set the environment variable `LAMINDB_SYNC_GIT_REPO` or set\n        `ln.settings.sync_git_repo`, a script-like transform is\n        synced to its hashed state in a git repository upon calling `ln.track()`::\n\n            ln.settings.sync_git_repo = \"https://github.com/laminlabs/lamindb\"\n            ln.track()\n\n        If the hash isn't found in the git repository, an error is thrown.\n\n        You can also create transforms that map pipelines via `Transform.from_git()`.\n\n    The definition of transforms and runs is consistent with the OpenLineage\n    specification where a `transform` would be called a \"job\" and a `run` a \"run\".\n\n    Args:\n        key: `str | None = None` A short name or path-like semantic key.\n        kind: `TransformKind | None = \"pipeline\"` See :class:`~lamindb.base.types.TransformKind`.\n        version: `str | None = None` A version string.\n        description: `str | None = None` A description.\n        reference: `str | None = None` A reference, e.g., a URL.\n        reference_type: `str | None = None` A reference type, e.g., 'url'.\n        source_code: `str | None = None` Source code of the transform.\n        revises: `Transform | None = None` An old version of the transform.\n        skip_hash_lookup: `bool = False` Skip the hash lookup so that a new transform is created even if a transform with the same hash already exists.\n\n    See Also:\n        :func:`~lamindb.track`\n            Track a script or notebook run.\n        :class:`~lamindb.Run`\n            Executions of transforms.\n\n    Notes:\n        - :doc:`docs:track`\n        - :doc:`docs:redun`\n        - :doc:`docs:nextflow`\n        - :doc:`docs:snakemake`\n\n    Examples:\n\n        Create a transform by running `ln.track()` in a notebook or a script::\n\n            ln.track()\n\n        Create a transform for a standalone function that acts as its own workflow::\n\n            @ln.flow()\n   
         def my_workflow():\n                print(\"Hello, world!\")\n\n        Create a transform for a step in a workflow::\n\n            @ln.step()\n            def my_step():\n                print(\"One step!\")\n\n        Create a transform for a pipeline::\n\n            transform = ln.Transform(key=\"Cell Ranger\", version=\"7.2.0\", kind=\"pipeline\").save()\n\n        Create a transform by saving a Python or shell script or a notebook via the CLI::\n\n            lamin save my_script.py\n            lamin save my_script.sh\n            lamin save my_notebook.ipynb\n\n    \"\"\"\n\n    class Meta(SQLRecord.Meta, IsVersioned.Meta):\n        abstract = False\n        app_label = \"lamindb\"\n        unique_together = (\"key\", \"hash\")\n\n    _len_stem_uid: int = 12\n    _len_full_uid: int = 16\n    _name_field: str = \"key\"\n\n    id: int = models.AutoField(primary_key=True)\n    \"\"\"Internal id, valid only in one DB instance.\"\"\"\n    uid: str = CharField(\n        editable=False, unique=True, db_index=True, max_length=_len_full_uid\n    )\n    \"\"\"Universal id.\"\"\"\n    # the max length equals the max length of an S3 key & the artifact key\n    key: str = CharField(db_index=True, max_length=1024)\n    \"\"\"A name or \"/\"-separated path-like string.\n\n    All transforms with the same key are part of the same version family.\n    \"\"\"\n    # db_index on description because sometimes we query for equality in the case of artifacts\n    description: str | None = TextField(null=True, db_index=True)\n    \"\"\"A description.\"\"\"\n    kind: TransformKind = CharField(\n        max_length=20,\n        db_index=True,\n        default=\"pipeline\",\n    )\n    \"\"\"A string indicating the kind of transform (default `\"pipeline\"`).\n\n    One of `\"pipeline\"`, `\"notebook\"`, `\"script\"`, or `\"function\"`.\n    \"\"\"\n    source_code: str | None = TextField(null=True)\n    \"\"\"Source code of the transform.\"\"\"\n    hash: str | None = 
CharField(max_length=HASH_LENGTH, db_index=True, null=True)\n    \"\"\"Hash of the source code.\"\"\"\n    reference: str | None = CharField(max_length=255, db_index=True, null=True)\n    \"\"\"Reference for the transform, e.g., a URL.\"\"\"\n    reference_type: str | None = CharField(max_length=25, db_index=True, null=True)\n    \"\"\"Reference type of the transform, e.g., 'url'.\"\"\"\n    environment: Artifact | None = models.ForeignKey(\n        \"Artifact\", CASCADE, null=True, related_name=\"_environment_of_transforms\"\n    )\n    \"\"\"An environment for executing the transform.\"\"\"\n    plan: Artifact | None = models.ForeignKey(\n        \"Artifact\",\n        CASCADE,\n        null=True,\n        related_name=\"_plan_for_transforms\",\n        default=None,\n    )\n    \"\"\"An optional plan for executing this transform.\"\"\"\n    runs: RelatedManager[Run]\n    \"\"\"Runs of this transform ← :attr:`~lamindb.Run.transform`.\"\"\"\n    ulabels: RelatedManager[ULabel] = models.ManyToManyField(\n        \"ULabel\", through=\"TransformULabel\", related_name=\"transforms\"\n    )\n    \"\"\"ULabel annotations of this transform ← :attr:`~lamindb.ULabel.transforms`.\"\"\"\n    linked_in_records: RelatedManager[Record] = models.ManyToManyField(\n        \"Record\", through=\"RecordTransform\", related_name=\"linked_transforms\"\n    )\n    \"\"\"This transform is linked in these records as a value ← :attr:`~lamindb.Record.linked_transforms`.\"\"\"\n    records: RelatedManager[Record]\n    \"\"\"Records that annotate this transform ← :attr:`~lamindb.Record.transforms`.\"\"\"\n    predecessors: RelatedManager[Transform] = models.ManyToManyField(\n        \"self\",\n        through=\"TransformTransform\",\n        symmetrical=False,\n        related_name=\"successors\",\n    )\n    \"\"\"Preceding transforms ← :attr:`~lamindb.Transform.successors`.\"\"\"\n    successors: RelatedManager[Transform]\n    \"\"\"Subsequent transforms ← 
:attr:`~lamindb.Transform.predecessors`.\n\n    Allows defining succeeding transforms. Is *not* necessary for data lineage, which is tracked automatically\n    whenever an artifact or collection serves as an input for a run.\n    \"\"\"\n    projects: RelatedManager[Project]\n    \"\"\"Linked projects ← :attr:`~lamindb.Project.transforms`.\"\"\"\n    references: RelatedManager[Reference]\n    \"\"\"Linked references ← :attr:`~lamindb.Reference.transforms`.\"\"\"\n    created_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    \"\"\"Time of creation of record.\"\"\"\n    updated_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    \"\"\"Time of last update to record.\"\"\"\n    created_by: User = ForeignKey(\n        User, PROTECT, default=current_user_id, related_name=\"created_transforms\"\n    )\n    \"\"\"Creator of record ← :attr:`~lamindb.User.created_transforms`.\"\"\"\n    ablocks: RelatedManager[TransformBlock]\n    \"\"\"Attached blocks ← :attr:`~lamindb.TransformBlock.transform`.\"\"\"\n\n    @overload\n    def __init__(\n        self,\n        key: str | None = None,\n        kind: TransformKind | None = None,\n        version: str | None = None,\n        description: str | None = None,\n        reference: str | None = None,\n        reference_type: str | None = None,\n        source_code: str | None = None,\n        revises: Transform | None = None,\n        skip_hash_lookup: bool = False,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(\n        self,\n        *args,\n        **kwargs,\n    ):\n        if len(args) == len(self._meta.concrete_fields):\n            super().__init__(*args, **kwargs)\n            return None\n        if args:\n            raise ValueError(\n                \"Please only use keyword arguments to construct a Transform\"\n            
)\n        key: str | None = kwargs.pop(\"key\", None)\n        description: str | None = kwargs.pop(\"description\", None)\n        revises: Transform | None = kwargs.pop(\"revises\", None)\n        version_tag: str | None = kwargs.pop(\"version_tag\", kwargs.pop(\"version\", None))\n        kind: TransformKind | None = kwargs.pop(\"kind\", None)\n        type: TransformKind | None = kwargs.pop(\"type\", None)\n        if type is not None:\n            warnings.warn(\n                \"`type` argument of transform was renamed to `kind` and will be removed in a future release.\",\n                DeprecationWarning,\n                stacklevel=2,\n            )\n        kind = kind if kind is not None else (type if type is not None else \"pipeline\")\n        reference: str | None = kwargs.pop(\"reference\", None)\n        reference_type: str | None = kwargs.pop(\"reference_type\", None)\n        branch = kwargs.pop(\"branch\", None)\n        branch_id = kwargs.pop(\"branch_id\", 1)\n        space = kwargs.pop(\"space\", None)\n        space_id = kwargs.pop(\"space_id\", 1)\n        skip_hash_lookup: bool = kwargs.pop(\"skip_hash_lookup\", False)\n        using_key = kwargs.pop(\"using_key\", None)\n        # below is internal use that we'll hopefully be able to eliminate\n        uid: str | None = kwargs.pop(\"uid\") if \"uid\" in kwargs else None\n        source_code: str | None = (\n            kwargs.pop(\"source_code\") if \"source_code\" in kwargs else None\n        )\n        if not len(kwargs) == 0:\n            raise ValueError(\n                \"Only key, description, version, kind, type, revises, reference, \"\n                f\"reference_type can be passed, but you passed: {kwargs}\"\n            )\n        if revises is None:\n            # need to check uid before checking key\n            if uid is not None:\n                revises = (\n                    Transform.objects.using(using_key)\n                    .filter(uid__startswith=uid[:-4], 
is_latest=True)\n                    .order_by(\"-created_at\")\n                    .first()\n                )\n            elif key is not None:\n                candidate_for_revises = (\n                    Transform.objects.using(using_key)\n                    .filter(~Q(branch_id=-1), key=key, is_latest=True)\n                    .order_by(\"-created_at\")\n                    .first()\n                )\n                if candidate_for_revises is not None:\n                    revises = candidate_for_revises\n                    if candidate_for_revises.source_code is None:\n                        # no source code was yet saved, return the same transform\n                        logger.important(\n                            \"no source code was yet saved, returning existing transform with same key\"\n                        )\n                        uid = revises.uid\n        if revises is not None and uid is not None and uid == revises.uid:\n            if revises.key != key:\n                logger.warning(\"ignoring inconsistent key\")\n            init_self_from_db(self, revises)\n            update_attributes(self, {\"description\": description})\n            return None\n        if revises is not None and key is not None and revises.key != key:\n            logger.important(f\"renaming transform {revises.key} to {key}\")\n        new_uid, version_tag, key, description, revises = process_revises(\n            revises, version_tag, key, description, Transform\n        )\n        # this is only because the user-facing constructor allows passing a uid\n        # most others don't\n        if uid is None:\n            has_consciously_provided_uid = False\n            uid = new_uid\n        else:\n            has_consciously_provided_uid = True\n        hash = None\n        if source_code is not None and not skip_hash_lookup:\n            hash = hash_string(source_code)\n            transform_candidate = Transform.objects.filter(\n                
~Q(branch_id=-1),\n                hash=hash,\n                is_latest=True,\n            ).first()\n            if transform_candidate is not None:\n                init_self_from_db(self, transform_candidate)\n                update_attributes(self, {\"description\": description})\n                if key is not None and transform_candidate.key != key:\n                    logger.warning(\n                        f\"key {self.key} on existing transform differs from passed key {key}, keeping original key; update manually if needed or pass skip_hash_lookup if you want to duplicate the transform\"\n                    )\n                return None\n        super().__init__(  # type: ignore\n            uid=uid,\n            description=description,\n            key=key,\n            kind=kind,\n            version_tag=version_tag,\n            reference=reference,\n            reference_type=reference_type,\n            source_code=source_code,\n            hash=hash,\n            _has_consciously_provided_uid=has_consciously_provided_uid,\n            revises=revises,\n            branch=branch,\n            branch_id=branch_id,\n            space=space,\n            space_id=space_id,\n        )\n\n    @classmethod\n    def from_git(\n        cls,\n        url: str,\n        path: str,\n        key: str | None = None,\n        version: str | None = None,\n        entrypoint: str | None = None,\n        branch: str | None = None,\n        description: str | None = None,\n        skip_hash_lookup: bool = False,\n    ) -> Transform:\n        \"\"\"Create a transform from a path in a git repository.\n\n        Args:\n            url: URL of the git repository.\n            path: Path to the file within the repository.\n            key: Optional key for the transform.\n            version: Optional version tag to checkout in the repository.\n            entrypoint: One or several optional comma-separated entrypoints for the transform.\n            branch: Optional 
branch to checkout.\n            description: Optional description for the transform.\n            skip_hash_lookup: Skip the hash lookup so that a new transform is created even if a transform with the same hash already exists.\n\n        Examples:\n\n            Create from a Nextflow repo and auto-infer the commit hash from its latest version::\n\n                transform = ln.Transform.from_git(\n                    url=\"https://github.com/openproblems-bio/task_batch_integration\",\n                    path=\"main.nf\"\n                ).save()\n\n            Create from a Nextflow repo and checkout a specific version::\n\n                transform = ln.Transform.from_git(\n                    url=\"https://github.com/openproblems-bio/task_batch_integration\",\n                    path=\"main.nf\",\n                    version=\"v2.0.0\"\n                ).save()\n                assert transform.version_tag == \"v2.0.0\"\n\n            Create a *sliding transform* from a Nextflow repo's `dev` branch.\n            Unlike a regular transform, a sliding transform doesn't pin a specific source code state,\n            but adapts to whatever the referenced state on the branch is::\n\n                transform = ln.Transform.from_git(\n                    url=\"https://github.com/openproblems-bio/task_batch_integration\",\n                    path=\"main.nf\",\n                    branch=\"dev\",\n                    version=\"dev\",\n                ).save()\n\n        Notes:\n\n            A regular transform pins a specific source code state through its commit hash::\n\n                transform.source_code\n                #> repo: https://github.com/openproblems-bio/task_batch_integration\n                #> path: main.nf\n                #> commit: 68eb2ecc52990617dbb6d1bb5c7158d9893796bb\n\n            A sliding transform infers the source code state from a branch::\n\n                transform.source_code\n                #> repo: 
https://github.com/openproblems-bio/task_batch_integration\n                #> path: main.nf\n                #> branch: dev\n\n            If an entrypoint is provided, it is added to the source code below the path, e.g.::\n\n                transform.source_code\n                #> repo: https://github.com/openproblems-bio/task_batch_integration\n                #> path: main.nf\n                #> entrypoint: myentrypoint\n                #> commit: 68eb2ecc52990617dbb6d1bb5c7158d9893796bb\n\n            Note that you can pass a comma-separated list of entrypoints to the `entrypoint` argument.\n\n        \"\"\"\n        from ..core._sync_git import get_and_validate_git_metadata\n\n        url, commit_hash = get_and_validate_git_metadata(url, path, version, branch)\n        if key is None:\n            key = (\n                url.split(\"/\")[-2]\n                + \"/\"\n                + url.split(\"/\")[-1].replace(\".git\", \"\")\n                + \"/\"\n                + path\n            )\n            logger.important(f\"inferred key '{key}' from url & path\")\n        source_code = f\"repo: {url}\\npath: {path}\"\n        if entrypoint is not None:\n            source_code += f\"\\nentrypoint: {entrypoint}\"\n        if branch is not None and version == branch:\n            from urllib.parse import quote\n\n            # sliding transform, no defined source code state\n            source_code += f\"\\nbranch: {branch}\"\n            reference, reference_type = (\n                f\"{url}/tree/{quote(branch, safe='')}/{path}\",\n                \"url\",\n            )\n        else:\n            # regular transform, defined source code state\n            source_code += f\"\\ncommit: {commit_hash}\"\n            reference, reference_type = f\"{url}/blob/{commit_hash}/{path}\", \"url\"\n        return Transform(\n            key=key,\n            kind=\"pipeline\",\n            version=version,\n            description=description,\n            
reference=reference,\n            reference_type=reference_type,\n            source_code=source_code,\n            skip_hash_lookup=skip_hash_lookup,\n        )\n\n    @property\n    def latest_run(self) -> Run:\n        \"\"\"The latest run of this transform.\"\"\"\n        return self.runs.order_by(\"-started_at\").first()\n\n    @property\n    @deprecated(new_name=\"kind\")\n    def type(self) -> TransformKind:\n        return self.kind\n\n    @type.setter\n    def type(self, value: TransformKind):\n        self.kind = value\n\n    def view_lineage(self, with_successors: bool = False, distance: int = 5):\n        \"\"\"View lineage of transforms.\n\n        Note that this only accounts for manually defined predecessors and successors.\n\n        Auto-generated lineage through inputs and outputs of runs is not included.\n        \"\"\"\n        from .has_parents import view_parents\n\n        return view_parents(\n            record=self,\n            field=\"key\",\n            with_children=with_successors,\n            distance=distance,\n            attr_name=\"predecessors\",\n        )\n\n    def _update_source_code_from_path(self, source_code_path: Path) -> None | str:\n        _, transform_hash, _ = hash_file(source_code_path)  # ignore hash_type for now\n        source_code = source_code_path.read_text()\n        source_code_to_store, redaction_count = redact_secrets_in_source_code(\n            source_code\n        )\n        if redaction_count > 0:\n            logger.warning(\n                f\"redacted {redaction_count} secret-looking assignment(s) before persisting transform source code\"\n            )\n        if self.hash is not None:\n            # check if the hash of the transform source code matches\n            if transform_hash != self.hash:\n                response = input(\n                    f\"You are about to overwrite existing source code (hash '{self.hash}') for Transform('{self.uid}').\"\n                    f\" Proceed? 
(y/n) \"\n                )\n                if response == \"y\":\n                    self.source_code = source_code_to_store\n                    self.hash = transform_hash\n                else:\n                    logger.warning(\"Please re-run `ln.track()` to make a new version\")\n                    return \"rerun-the-notebook\"\n            else:\n                logger.debug(\"source code is already saved\")\n        else:\n            self.source_code = source_code_to_store\n            self.hash = transform_hash\n        return None\n\n\ndef _permanent_delete_transforms(transforms: Transform | QuerySet) -> None:\n    \"\"\"Execute bulk DELETE on transforms (runs, then transforms). Used by QuerySet and single-transform paths.\"\"\"\n    from django.db.models import QuerySet as DjangoQuerySet\n\n    from .project import TransformProject\n\n    if isinstance(transforms, Transform):\n        db = transforms._state.db or \"default\"\n        qs = Transform.objects.using(db).filter(pk=transforms.pk)\n    else:\n        db = transforms.db or \"default\"\n        qs = transforms\n    objects = list(qs)\n    if not objects:\n        return\n    _adjust_is_latest_when_deleting_is_versioned(objects)\n    transform_ids = [o.pk for o in objects]\n    TransformProject.objects.using(db).filter(transform_id__in=transform_ids).delete()\n    Run.objects.using(db).filter(transform_id__in=transform_ids).delete(permanent=True)\n    DjangoQuerySet.delete(qs)\n\n\nclass TransformTransform(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    successor: Transform = ForeignKey(\n        \"Transform\", CASCADE, related_name=\"links_predecessor\"\n    )\n    predecessor: Transform = ForeignKey(\n        \"Transform\", CASCADE, related_name=\"links_successor\"\n    )\n    config: dict | None = models.JSONField(default=None, null=True)\n    created_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now()\n    )\n    
created_by: User = ForeignKey(\n        \"lamindb.User\", PROTECT, default=current_user_id, related_name=\"+\"\n    )\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"successor\", \"predecessor\")\n"
  },
  {
    "path": "lamindb/models/ulabel.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, overload\n\nimport pgtrigger\nfrom django.conf import settings as django_settings\nfrom django.db import models\nfrom django.db.models import CASCADE, PROTECT\n\nfrom lamindb.base.fields import (\n    CharField,\n    DateTimeField,\n    ForeignKey,\n    TextField,\n)\nfrom lamindb.errors import FieldValidationError\n\nfrom ..base.uids import base62_8\nfrom .can_curate import CanCurate\nfrom .feature import Feature\nfrom .has_parents import HasParents, _query_relatives\nfrom .run import Run, TracksRun, TracksUpdates, User, current_user_id\nfrom .sqlrecord import BaseSQLRecord, HasType, IsLink, SQLRecord, _get_record_kwargs\nfrom .transform import Transform\n\nif TYPE_CHECKING:\n    from datetime import datetime\n\n    from .artifact import Artifact\n    from .block import ULabelBlock\n    from .collection import Collection\n    from .project import Project\n    from .query_manager import RelatedManager\n    from .query_set import QuerySet\n    from .record import Record\n    from .sqlrecord import Branch\n\n\nclass ULabel(SQLRecord, HasType, HasParents, CanCurate, TracksRun, TracksUpdates):\n    \"\"\"Universal labels.\n\n    It behaves like `Record`, just without the ability to link features.\n\n    Args:\n        name: `str` A name.\n        description: `str | None = None` A description.\n        reference: `str | None = None` For instance, an external ID or a URL.\n        reference_type: `str | None = None` For instance, `\"url\"`.\n\n    See Also:\n        :class:`~lamindb.Record`\n            Like `ULabel`, but with the ability to link features.\n\n    Examples:\n\n        Create a label and annotate an :class:`~lamindb.Artifact`::\n\n            train_split = ln.ULabel(name=\"train\").save()\n            artifact.ulabels.add(train_split)\n\n        Query artifacts by label::\n\n            ln.Artifact.filter(ulabels=train_split).to_dataframe()\n\n        Organize ulabels 
in a type hierarchy, based on the `type` field::\n\n            split_type = ln.ULabel(name=\"Split\", is_type=True).save()\n            train_split = ln.ULabel(name=\"train\", type=split_type).save()\n\n        The `type` hierarchy gives rise to a tree. If you need to model a full DAG-like **ontology**, use the `parents`/`children` fields::\n\n            cell_type = ln.Record(name=\"CellType\", is_type=True).save()\n            t_cell = ln.Record(name=\"T Cell\", type=cell_type).save()\n            cd4_t_cell = ln.Record(name=\"CD4+ T Cell\", type=cell_type).save()\n            t_cell.children.add(cd4_t_cell)\n\n        If you work with basic biological entities like cell lines, cell types, tissues,\n        consider building on the public biological ontologies in :mod:`bionty`,\n        which work in the same way.\n\n    \"\"\"\n\n    class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta):\n        abstract = False\n        app_label = \"lamindb\"\n        if (\n            django_settings.DATABASES.get(\"default\", {}).get(\"ENGINE\")\n            == \"django.db.backends.postgresql\"\n        ):\n            triggers = [\n                pgtrigger.Trigger(\n                    name=\"prevent_ulabel_type_cycle\",\n                    operation=pgtrigger.Update | pgtrigger.Insert,\n                    when=pgtrigger.Before,\n                    condition=pgtrigger.Condition(\"NEW.type_id IS NOT NULL\"),\n                    func=\"\"\"\n                        -- Check for direct self-reference\n                        IF NEW.type_id = NEW.id THEN\n                            RAISE EXCEPTION 'Cannot set type: ulabel cannot be its own type';\n                        END IF;\n\n                        -- Check for cycles in the type chain\n                        IF EXISTS (\n                            WITH RECURSIVE type_chain AS (\n                                SELECT type_id, 1 as depth\n                                FROM lamindb_ulabel\n        
                        WHERE id = NEW.type_id\n\n                                UNION ALL\n\n                                SELECT r.type_id, tc.depth + 1\n                                FROM lamindb_ulabel r\n                                INNER JOIN type_chain tc ON r.id = tc.type_id\n                                WHERE tc.depth < 100\n                            )\n                            SELECT 1 FROM type_chain WHERE type_id = NEW.id\n                        ) THEN\n                            RAISE EXCEPTION 'Cannot set type: would create a cycle';\n                        END IF;\n\n                        RETURN NEW;\n                    \"\"\",\n                ),\n            ]\n        # also see raw SQL constraints for `is_type` and `type` FK validity in migrations\n\n    _name_field: str = \"name\"\n\n    id: int = models.AutoField(primary_key=True)\n    \"\"\"Internal id, valid only in one DB instance.\"\"\"\n    uid: str = CharField(\n        editable=False, unique=True, db_index=True, max_length=8, default=base62_8\n    )\n    \"\"\"A universal random id, valid across DB instances.\"\"\"\n    name: str = CharField(max_length=150, db_index=True)\n    \"\"\"Name or title of ulabel.\"\"\"\n    type: ULabel | None = ForeignKey(\"self\", PROTECT, null=True, related_name=\"ulabels\")\n    \"\"\"Type of ulabel, e.g., `\"donor\"`, `\"split\"`, etc. 
← :attr:`~lamindb.ULabel.ulabels`\n\n    Allows grouping ulabels by type, e.g., all donors, all split ulabels, etc.\n    \"\"\"\n    ulabels: RelatedManager[ULabel]\n    \"\"\"ULabels of this type (can only be non-empty if `is_type` is `True`).\"\"\"\n    description: str | None = TextField(null=True)\n    \"\"\"A description.\"\"\"\n    reference: str | None = CharField(max_length=255, db_index=True, null=True)\n    \"\"\"A simple reference like URL or external ID.\"\"\"\n    reference_type: str | None = CharField(max_length=25, db_index=True, null=True)\n    \"\"\"Type of simple reference.\"\"\"\n    parents: RelatedManager[ULabel] = models.ManyToManyField(\n        \"self\", symmetrical=False, related_name=\"children\"\n    )\n    \"\"\"Parent entities of this ulabel ← :attr:`~lamindb.ULabel.children`.\n\n    For advanced use cases, you can build an ontology under a given `type`.\n\n    Say, if you modeled `CellType` as a `ULabel`, you would introduce a type `CellType` and model the hierarchy of cell types under it.\n    \"\"\"\n    children: RelatedManager[ULabel]\n    \"\"\"Child entities of this ulabel.\n\n    Reverse accessor for parents.\n    \"\"\"\n    transforms: RelatedManager[Transform]\n    \"\"\"The transforms annotated by this ulabel ← :attr:`~lamindb.Transform.ulabels`.\"\"\"\n    runs: RelatedManager[Run]\n    \"\"\"The runs annotated by this ulabel ← :attr:`~lamindb.Run.ulabels`.\"\"\"\n    artifacts: RelatedManager[Artifact] = models.ManyToManyField(\n        \"Artifact\", through=\"ArtifactULabel\", related_name=\"ulabels\"\n    )\n    \"\"\"The artifacts annotated by this ulabel ← :attr:`~lamindb.Artifact.ulabels`.\"\"\"\n    collections: RelatedManager[Collection]\n    \"\"\"The collections annotated by this ulabel ← :attr:`~lamindb.Collection.ulabels`.\"\"\"\n    projects: RelatedManager[Project]\n    \"\"\"The projects annotating this ulabel ← :attr:`~lamindb.Project.ulabels`.\"\"\"\n    branches: RelatedManager[Branch]\n    \"\"\"The 
branches annotated by this ulabel ← :attr:`~lamindb.Branch.ulabels`.\"\"\"\n    linked_in_records: RelatedManager[Record] = models.ManyToManyField(\n        \"Record\",\n        through=\"RecordULabel\",\n        related_name=\"linked_ulabels\",\n    )\n    \"\"\"Records linking this ulabel as a value ← :attr:`~lamindb.Record.linked_ulabels`.\"\"\"\n    ablocks: RelatedManager[ULabelBlock]\n    \"\"\"Attached blocks ← :attr:`~lamindb.ULabelBlock.ulabel`.\"\"\"\n\n    @overload\n    def __init__(\n        self,\n        name: str,\n        type: ULabel | None = None,\n        is_type: bool = False,\n        description: str | None = None,\n        reference: str | None = None,\n        reference_type: str | None = None,\n    ): ...\n\n    @overload\n    def __init__(\n        self,\n        *db_args,\n    ): ...\n\n    def __init__(\n        self,\n        *args,\n        **kwargs,\n    ):\n        if len(args) == len(self._meta.concrete_fields):\n            super().__init__(*args, **kwargs)\n            return None\n        if len(args) > 0:\n            raise ValueError(\"Only one non-keyword arg allowed\")\n        name: str = kwargs.pop(\"name\", None)\n        type: str | None = kwargs.pop(\"type\", None)\n        is_type: bool = kwargs.pop(\"is_type\", False)\n        description: str | None = kwargs.pop(\"description\", None)\n        reference: str | None = kwargs.pop(\"reference\", None)\n        reference_type: str | None = kwargs.pop(\"reference_type\", None)\n        branch = kwargs.pop(\"branch\", None)\n        branch_id = kwargs.pop(\"branch_id\", 1)\n        space = kwargs.pop(\"space\", None)\n        space_id = kwargs.pop(\"space_id\", 1)\n        _skip_validation = kwargs.pop(\"_skip_validation\", False)\n        _aux = kwargs.pop(\"_aux\", None)\n        if len(kwargs) > 0:\n            valid_keywords = \", \".join([val[0] for val in _get_record_kwargs(ULabel)])\n            raise FieldValidationError(\n                f\"Only {valid_keywords} 
are valid keyword arguments\"\n            )\n        super().__init__(\n            name=name,\n            type=type,\n            is_type=is_type,\n            description=description,\n            reference=reference,\n            reference_type=reference_type,\n            branch=branch,\n            branch_id=branch_id,\n            space=space,\n            space_id=space_id,\n            _skip_validation=_skip_validation,\n            _aux=_aux,\n        )\n\n    def query_ulabels(self) -> QuerySet:\n        \"\"\"Query ulabels of sub types.\n\n        While `.ulabels` retrieves the ulabels with the current type, this method\n        also retrieves sub types and the ulabels with sub types of the current type.\n        \"\"\"\n        return _query_relatives([self], \"ulabels\")  # type: ignore\n\n\nclass ArtifactULabel(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    artifact: Artifact = ForeignKey(\"Artifact\", CASCADE, related_name=\"links_ulabel\")\n    ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name=\"links_artifact\")\n    feature: Feature | None = ForeignKey(\n        Feature, PROTECT, null=True, related_name=\"links_artifactulabel\", default=None\n    )\n\n    class Meta:\n        # can have the same label linked to the same artifact if the feature is\n        # different\n        app_label = \"lamindb\"\n        unique_together = (\"artifact\", \"ulabel\", \"feature\")\n\n\nclass TransformULabel(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    transform: Transform = ForeignKey(Transform, CASCADE, related_name=\"links_ulabel\")\n    ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name=\"links_transform\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"transform\", \"ulabel\")\n\n\nclass RunULabel(BaseSQLRecord, IsLink):\n    id: int = models.BigAutoField(primary_key=True)\n    run: Run = ForeignKey(Run, CASCADE, 
related_name=\"links_ulabel\")\n    ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name=\"links_run\")\n    created_at: datetime = DateTimeField(\n        editable=False, db_default=models.functions.Now(), db_index=True\n    )\n    \"\"\"Time of creation of record.\"\"\"\n    created_by: User = ForeignKey(\n        \"lamindb.User\", PROTECT, default=current_user_id, related_name=\"+\"\n    )\n    \"\"\"Creator of record.\"\"\"\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"run\", \"ulabel\")\n\n\nclass BranchULabel(BaseSQLRecord, IsLink):\n    \"\"\"Link model for branch–ulabel association.\"\"\"\n\n    id: int = models.BigAutoField(primary_key=True)\n    branch: Branch = ForeignKey(\"Branch\", CASCADE, related_name=\"links_ulabel\")\n    ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name=\"links_branch\")\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"branch\", \"ulabel\")\n\n\nclass CollectionULabel(BaseSQLRecord, IsLink, TracksRun):\n    id: int = models.BigAutoField(primary_key=True)\n    collection: Collection = ForeignKey(\n        \"Collection\", CASCADE, related_name=\"links_ulabel\"\n    )\n    ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name=\"links_collection\")\n    feature: Feature | None = ForeignKey(\n        Feature, PROTECT, null=True, related_name=\"links_collectionulabel\", default=None\n    )\n\n    class Meta:\n        app_label = \"lamindb\"\n        unique_together = (\"collection\", \"ulabel\")\n"
  },
  {
    "path": "lamindb/py.typed",
    "content": ""
  },
  {
    "path": "lamindb/setup/__init__.py",
    "content": "import lamindb_setup as _lamindb_setup\nfrom lamindb_setup import *  # noqa: F403\nfrom lamindb_setup import (\n    connect,\n    delete,\n    init,\n    settings,\n)\n\nfrom . import core, errors, types\nfrom ._merge import merge  # noqa: F401\nfrom ._switch import switch  # noqa: F401\n\ndel connect  # we have this at the root level, hence, we don't want it here\n__doc__ = _lamindb_setup.__doc__.replace(\"lamindb_setup\", \"lamindb.setup\")\nsettings.__doc__ = settings.__doc__.replace(\"lamindb_setup\", \"lamindb.setup\")\n"
  },
  {
    "path": "lamindb/setup/_merge.py",
    "content": "# Tested in lamin-cli (tests/core/test_create_switch_delete_list_settings.py::test_merge*).\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nimport lamindb_setup as ln_setup\nfrom django.apps import apps\nfrom django.db import connection\nfrom django.db.utils import DatabaseError\nfrom lamin_utils import logger\n\nif TYPE_CHECKING:\n    from lamindb.models import Branch\n\n\ndef merge(branch: str | Branch) -> None:\n    \"\"\"Merge a branch into the current branch.\n\n    All `SQLRecord` objects that have `branch_id` equal to the source branch's id\n    are updated to the current branch's id.\n\n    Find more info in the :class:`~lamindb.Branch` document.\n\n    Args:\n        branch: The source branch to merge from. Accepts a `name`, a `uid`, or the `Branch` object.\n\n    Raises:\n        DoesNotExist: If the branch does not exist.\n    \"\"\"\n    from lamindb import Branch, Q\n    from lamindb.errors import ObjectDoesNotExist\n\n    from ..models import SQLRecord\n    from ..models._is_versioned import IsVersioned, reconcile_is_latest_within_branch\n    from ..models.sqlrecord import BRANCH_SENSITIVE_BLOCK_MODEL_NAMES\n\n    if isinstance(branch, Branch):\n        source = branch\n        if source._state.adding:\n            raise ObjectDoesNotExist(\"Branch must be saved.\")\n    else:\n        source = Branch.filter(Q(name=branch) | Q(uid=branch)).one_or_none()\n        if source is None:\n            raise ObjectDoesNotExist(f\"Branch '{branch}' not found.\")\n\n    current = ln_setup.settings.branch\n    if current.id == source.id:\n        logger.important(\"already on branch, nothing to merge\")\n        return\n\n    sqlrecord_models = [\n        m\n        for m in apps.get_models()\n        if issubclass(m, SQLRecord) and not m._meta.abstract\n    ]\n    attached_block_models = [\n        model\n        for model_name in sorted(BRANCH_SENSITIVE_BLOCK_MODEL_NAMES)\n        if (model := 
apps.get_model(\"lamindb\", model_name)) is not None\n    ]\n    models = list(dict.fromkeys([*sqlrecord_models, *attached_block_models]))\n    if not models:\n        return\n\n    vendor = connection.vendor\n    quoted_tables = [connection.ops.quote_name(m._meta.db_table) for m in models]\n\n    with connection.cursor() as cursor:\n        if vendor == \"postgresql\":\n            # Single round-trip: one multi-statement execute\n            statements = [\n                f\"UPDATE {tbl} SET branch_id = %s WHERE branch_id = %s\"\n                for tbl in quoted_tables\n            ]\n            sql = \"BEGIN; \" + \"; \".join(statements) + \"; COMMIT;\"\n            params = [current.id, source.id] * len(quoted_tables)\n            try:\n                cursor.execute(sql, params)\n            except DatabaseError as e:\n                logger.error(f\"Merge failed: {e}\")\n                raise\n        else:\n            # SQLite: execute() runs only the first statement; run each UPDATE\n            # in a loop (same connection, so still one transaction if we're inside\n            # a transaction or use autocommit-off).\n            from django.db import transaction\n\n            with transaction.atomic():\n                for tbl in quoted_tables:\n                    # Django uses %s; SQLite backend converts to ?\n                    cursor.execute(\n                        f\"UPDATE {tbl} SET branch_id = %s WHERE branch_id = %s\",\n                        [current.id, source.id],\n                    )\n\n    versioned_models = [m for m in models if issubclass(m, IsVersioned)]\n    for model in versioned_models:\n        reconcile_is_latest_within_branch(model, branch_id=current.id)\n\n    source._status_code = -1  # merged\n    source.save(update_fields=[\"_status_code\"])\n    logger.important(f\"merged branch '{source.name}' into '{current.name}'\")\n"
  },
  {
    "path": "lamindb/setup/_switch.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom lamin_utils import logger\nfrom lamindb_setup import settings\n\nif TYPE_CHECKING:\n    from lamindb.models import Branch\n\n\ndef switch(target: str | Branch, *, space: bool = False, create: bool = False):\n    \"\"\"Switch to a branch or space, optionally creating the branch first.\n\n    Args:\n        target: Branch target or space target to switch to.\n        space: If True, switch space; otherwise switch branch.\n        create: If True and switching branch, create the branch before switching to it.\n\n    Raises:\n        BranchAlreadyExists: If `create` is True and a branch with this name or uid already exists.\n    \"\"\"\n    if space:\n        settings.space = target\n    else:\n        if create:\n            from lamindb import Branch, Q\n            from lamindb.errors import BranchAlreadyExists\n\n            # Consistent with git switch -c: error if branch already exists.\n            existing = Branch.filter(Q(name=target) | Q(uid=target)).one_or_none()\n            if existing is not None:\n                raise BranchAlreadyExists(\n                    f\"Branch '{target}' already exists. Omit -c/--create to switch to it.\"\n                )\n            Branch(name=target).save()\n            logger.important(f\"created branch: {target}\")\n        settings.branch = target\n    logger.important(f\"switched to {target}\")\n"
  },
  {
    "path": "lamindb/setup/core/__init__.py",
    "content": "import lamindb_setup as _lamindb_setup\nfrom lamindb_setup.core import *  # noqa: F403\n\n__doc__ = _lamindb_setup.core.__doc__.replace(\"lamindb_setup\", \"lamindb.setup\")\n"
  },
  {
    "path": "lamindb/setup/errors/__init__.py",
    "content": "import lamindb_setup as _lamindb_setup\nfrom lamindb_setup.errors import *  # noqa: F403\n\n__doc__ = _lamindb_setup.errors.__doc__.replace(\"lamindb_setup\", \"lamindb.setup\")\n"
  },
  {
    "path": "lamindb/setup/types/__init__.py",
    "content": "import lamindb_setup as _lamindb_setup\nfrom lamindb_setup.types import *  # noqa: F403\n\n__doc__ = _lamindb_setup.types.__doc__.replace(\"lamindb_setup\", \"lamindb.setup\")\n"
  },
  {
    "path": "lamindb_full.py",
    "content": "\"\"\"Full/meta-package module for the `lamindb` distribution.\"\"\"\n\nfrom __future__ import annotations\n\nimport re\nfrom pathlib import Path\n\n_INIT_FILE = Path(__file__).parent / \"lamindb\" / \"__init__.py\"\n_MATCH = re.search(r'__version__\\s*=\\s*\"([^\"]+)\"', _INIT_FILE.read_text())\nif _MATCH is None:\n    raise RuntimeError(f\"Could not parse __version__ from {_INIT_FILE}\")\n\n__version__ = _MATCH.group(1)\n"
  },
  {
    "path": "noxfile.py",
    "content": "import os\nimport shutil\nfrom pathlib import Path\n\nimport nox\nfrom laminci import convert_executable_md_files, upload_docs_artifact\nfrom laminci.nox import (\n    build_docs,\n    login_testuser1,\n    login_testuser2,\n    run,\n    run_pre_commit,\n)\n\n# we'd like to aggregate coverage information across sessions\n# and for this the code needs to be located in the same\n# directory in every github action runner\n# this also allows to break out an installation section\nnox.options.default_venv_backend = \"none\"\n\nIS_PR = os.getenv(\"GITHUB_EVENT_NAME\") != \"push\"\nCI = os.environ.get(\"CI\")\n# SpatialData.write() regression with ome-zarr>=0.14:\n# https://github.com/scverse/spatialdata/issues/1090\nSPATIALDATA_OME_ZARR_CONSTRAINT = \"ome-zarr<0.14.0\"\n\n\nGROUPS = {}\nGROUPS[\"tutorial\"] = [\n    \"README.ipynb\",\n    \"sync.ipynb\",\n    \"arrays.ipynb\",\n    \"registries.ipynb\",\n]\nGROUPS[\"guide\"] = [\n    \"track.ipynb\",\n]\nGROUPS[\"tiledbsoma\"] = [\n    \"curate.ipynb\",\n]\nGROUPS[\"biology\"] = [\n    \"manage-ontologies.ipynb\",\n]\n\n\n@nox.session\ndef lint(session: nox.Session) -> None:\n    run_pre_commit(session)\n\n\n@nox.session\ndef install(session):\n    base_deps = [\n        \"./sub/lamin-cli\",\n        \"./sub/lamindb-setup\",\n        \"./sub/bionty\",\n    ]\n    top_deps = [\n        \".[full,dev]\",\n    ]\n    cmds = [\n        f\"uv pip install {'--system' if CI else ''} --no-cache-dir {' '.join(base_deps)}\",\n    ] + [\n        f\"uv pip install {'--system' if CI else ''} --no-cache-dir -e {dep}\"\n        for dep in top_deps\n    ]\n    [run(session, line) for line in cmds]\n\n\n@nox.session\n@nox.parametrize(\n    \"group\",\n    [\n        \"unit-core-sqlite\",\n        \"unit-core-postgres\",\n        \"unit-storage\",\n        \"no-instance\",\n        \"tutorial\",\n        \"guide\",\n        \"tiledbsoma\",\n        \"biology\",\n        \"faq\",\n        \"storage\",\n        \"curator\",\n 
       \"integrations\",\n        \"docs\",\n        \"cli\",\n        \"permissions\",\n    ],\n)\ndef install_ci(session, group):\n    extras = \"\"\n    if group in [\"unit-core-sqlite\", \"unit-core-postgres\"]:\n        extras += \"fcs\"\n        run(session, \"uv pip install --system scanpy\")\n        run(session, \"uv pip install --system mudata\")\n        # spatialdata dependency, specifying it here explicitly\n        # otherwise there are problems with uv resolver\n        run(session, \"uv pip install --system xarray-dataclasses\")\n        run(\n            session,\n            f\"uv pip install --system spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT}\",\n        )\n    elif group == \"unit-storage\":\n        extras += \"gcp\"\n        run(session, \"uv pip install --system huggingface_hub\")\n        run(session, \"uv pip install --system scanpy\")\n        run(session, \"uv pip install --system polars\")\n    elif group == \"tutorial\":\n        # anndata here to prevent installing older version on release\n        run(session, \"uv pip install --system huggingface_hub polars anndata==0.12.2\")\n    elif group == \"guide\":\n        extras += \"zarr_v2\"\n        run(\n            session,\n            f\"uv pip install --system scanpy mudata spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT}\",\n        )\n    elif group == \"tiledbsoma\":\n        extras += \"zarr_v2\"\n        run(\n            session,\n            f\"uv pip install --system scanpy mudata spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT} tiledbsoma\",\n        )\n    elif group == \"biology\":\n        extras += \"fcs\"\n        run(session, \"uv pip install --system ipywidgets\")\n    elif group == \"faq\":\n        extras += \"zarr_v2\"\n    elif group == \"storage\":\n        extras += \"zarr_v2\"\n        run(\n            session,\n            \"uv pip install --system --no-deps ./sub/pertdb\",\n        )\n        run(session, \"uv pip install --system vitessce\")\n    elif group == 
\"curator\":\n        run(\n            session,\n            \"uv pip install --system --no-deps ./sub/pertdb\",\n        )\n        # spatialdata dependency, specifying it here explicitly\n        # otherwise there are problems with uv resolver\n        run(session, \"uv pip install --system xarray-dataclasses\")\n        run(\n            session,\n            f\"uv pip install --system spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT}\",\n        )\n    elif group == \"integrations\":\n        run(session, \"uv pip install --system lightning\")\n    elif group == \"docs\":\n        extras += \"zarr_v2\"\n        # spatialdata dependency, specifying it here explicitly\n        # otherwise there are problems with uv resolver\n        run(session, \"uv pip install --system xarray-dataclasses\")\n        run(\n            session,\n            f\"uv pip install --system mudata spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT} lightning\",\n        )\n        run(\n            session,\n            \"uv pip install --system --no-deps ./sub/pertdb\",\n        )\n    elif group == \"cli\":\n        pass\n    elif group == \"permissions\":\n        pass\n\n    extras = \",\" + extras if extras != \"\" else extras\n    run(session, f\"uv pip install --system -e .[full,dev{extras}]\")\n\n    # on the release branch, do not use submodules but run with pypi install\n    # only exception is the docs group which should always use the submodule\n    # to push docs fixes fast\n    # installing this after lamindb to be sure that these packages won't be reinstaled\n    # during lamindb installation\n    if IS_PR or group == \"docs\":\n        run(\n            session,\n            \"uv pip install --system ./sub/lamindb-setup ./sub/lamin-cli ./sub/bionty ./sub/pertdb\",\n        )\n    if group == \"permissions\":\n        # have to install after lamindb installation\n        # because lamindb downgrades django required by laminhub_rest\n        cmds = \"uv pip install --system 
./laminhub/backend\"\n        cmds += \"\\nuv pip install --system ./laminhub/backend/utils\"\n        cmds += \"\\nuv pip install --system ./laminhub/backend/services/central\"\n        cmds += \"\\nuv pip install --system ./laminhub/backend/services/instancedb\"\n        cmds += \"\\nuv pip install --system ./laminhub/backend/services/aws\"\n        cmds += \"\\nuv pip install --system --no-deps ./laminhub/backend/services/instancedb/hubmodule\"\n        [run(session, line) for line in cmds.splitlines()]\n\n\n@nox.session\ndef configure_coverage(session) -> None:\n    \"\"\"Write a coverage config file, adding extra patterns to omit.\"\"\"\n    import tomlkit\n\n    groups_str = session.posargs[0]  # first positional argument\n\n    print(groups_str)  # for debugging\n    # so that we don't change this away from string\n    assert isinstance(groups_str, str)  # noqa: S101\n\n    if \"curator\" not in groups_str and \"tiledbsoma\" not in groups_str:\n        extra_omit_patterns = [\"**/curators/*\"]\n    else:\n        extra_omit_patterns = []\n\n    # Read patterns from pyproject.toml\n    base_config_path = Path(\"pyproject.toml\")\n    with open(base_config_path) as f:\n        config = tomlkit.load(f)\n\n    # Update the omit patterns\n    base_patterns = config[\"tool\"][\"coverage\"][\"run\"][\"omit\"]\n    all_patterns = base_patterns + extra_omit_patterns\n    config[\"tool\"][\"coverage\"][\"run\"][\"omit\"] = all_patterns\n\n    # Write back to pyproject.toml\n    with open(base_config_path, \"w\") as f:\n        tomlkit.dump(config, f)\n\n    print(base_config_path.read_text())\n\n\n@nox.session\ndef prepare(session):\n    \"\"\"Create executable files to run during a test session.\n\n    Is not needed for unit tests!\n    \"\"\"\n    content = open(\"README.md\").read()\n    # cannot execute the flow after ln.track() was called\n    content = content.replace(\"    create_fasta()\", \"    pass\")\n    open(\"README_stripped.md\", \"w\").write(\n        
\"\\n\".join(\n            line\n            for line in content.split(\"\\n\")\n            if not line.strip().startswith(\n                (\"accessor = artifact.open()\", \"ln.track(project=\", \"ln.Project(name=\")\n            )\n        )\n    )\n\n    os.system(\"jupytext README_stripped.md --to notebook --output ./docs/README.ipynb\")\n    convert_executable_md_files()\n    os.system(\"cp ./tests/core/test_artifact_parquet.py ./docs/scripts/\")\n    os.system(\"cp ./lamindb/examples/schemas/define_valid_features.py ./docs/scripts/\")\n    os.system(\n        \"cp ./lamindb/examples/schemas/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py ./docs/scripts/\"\n    )\n    os.system(\n        \"cp ./lamindb/examples/datasets/define_mini_immuno_features_labels.py ./docs/scripts/\"\n    )\n    os.system(\n        \"cp ./lamindb/examples/datasets/define_mini_immuno_schema_flexible.py ./docs/scripts/\"\n    )\n    os.system(\n        \"cp ./lamindb/examples/datasets/save_mini_immuno_datasets.py ./docs/scripts/\"\n    )\n\n\n@nox.session\n@nox.parametrize(\n    \"group\",\n    [\n        \"unit-core-sqlite\",\n        \"unit-core-postgres\",\n        \"unit-storage\",\n        \"no-instance\",\n        \"curator\",\n        \"integrations\",\n        \"tutorial\",\n        \"guide\",\n        \"tiledbsoma\",\n        \"biology\",\n        \"faq\",\n        \"storage\",\n        \"cli\",\n        \"permissions\",\n    ],\n)\ndef test(session, group):\n    # we likely don't need auth in many other groups, but have to carefully expand this\n    if group not in {\"curator\", \"no-instance\"}:\n        login_testuser2(session)\n        login_testuser1(session)\n    # this is mostly needed for the docs so that we don't render Django's entire public API\n    run(session, \"lamin settings set private-django-api true\")\n    coverage_args = \"--cov=lamindb --cov-config=pyproject.toml --cov-append --cov-report=term-missing\"\n    duration_args = 
\"--durations=10\"\n\n    env = os.environ.copy()\n    if group == \"unit-core-sqlite\":\n        env[\"LAMINDB_TEST_DB_VENDOR\"] = \"sqlite\"\n        run(\n            session,\n            f\"pytest {coverage_args} ./tests/core {duration_args}\",\n            env=env,\n        )\n    elif group == \"unit-core-postgres\":\n        env[\"LAMINDB_TEST_DB_VENDOR\"] = \"postgresql\"\n        run(\n            session,\n            f\"pytest {coverage_args} ./tests/core {duration_args}\",\n            env=env,\n        )\n    elif group == \"unit-storage\":\n        login_testuser2(session)  # shouldn't be necessary but is for now\n        run(session, f\"pytest {coverage_args} ./tests/storage {duration_args}\")\n    elif group == \"no-instance\":\n        run(session, \"lamin disconnect\")\n        run(session, f\"pytest {coverage_args} ./tests/no_instance {duration_args}\")\n    elif group == \"tutorial\":\n        run(session, \"lamin logout\")\n        run(session, \"lamin init --storage ./test-readme --modules bionty\")\n        run(\n            session, f\"pytest -s {coverage_args} ./docs/test_notebooks.py::test_{group}\"\n        )\n    elif group == \"guide\":\n        run(\n            session,\n            f\"pytest -s {coverage_args} ./docs/test_notebooks.py::test_{group}\",\n        )\n    elif group == \"tiledbsoma\":\n        run(\n            session,\n            (\n                f\"pytest {coverage_args} tests/tiledbsoma \"\n                \"./docs/test_notebooks.py::test_tiledbsoma \"\n                f\"{duration_args}\"\n            ),\n        )\n    elif group == \"biology\":\n        run(\n            session,\n            f\"pytest -s {coverage_args} ./docs/test_notebooks.py::test_{group}\",\n        )\n    elif group == \"faq\":\n        run(session, f\"pytest -s {coverage_args} ./docs/faq\")\n    elif group == \"storage\":\n        run(session, f\"pytest -s {coverage_args} ./docs/storage\")\n    elif group == \"curator\":\n        run(\n  
          session,\n            f\"pytest {coverage_args} tests/curators {duration_args}\",\n        )\n    elif group == \"integrations\":\n        run(session, f\"pytest -s {coverage_args} tests/integrations\")\n    elif group == \"cli\":\n        run(\n            session,\n            f\"pytest {coverage_args} ./sub/lamin-cli/tests/core {duration_args}\",\n        )\n    elif group == \"permissions\":\n        run(session, f\"pytest {coverage_args} ./tests/permissions\")\n    # move artifacts into right place\n    if group in {\"tutorial\", \"guide\", \"tiledbsoma\", \"biology\"}:\n        target_dir = Path(f\"./docs/{group}\")\n        target_dir.mkdir(exist_ok=True)\n        for filename in GROUPS[group]:\n            shutil.copy(Path(\"docs\") / filename, target_dir / filename)\n\n\n@nox.session\ndef clidocs(session):\n    def generate_cli_docs():\n        os.environ[\"NO_RICH\"] = \"1\"\n        from lamin_cli.__main__ import COMMAND_GROUPS, _generate_help\n\n        page = \"# CLI\\n\\n\"\n        helps = _generate_help()\n\n        # First, add the main lamin command\n        main_help = helps.get(\"main\")\n        if main_help:\n            help_string = main_help[\"help\"].replace(\"Usage: main\", \"Usage: lamin\")\n            help_docstring = main_help[\"docstring\"]\n            if help_docstring:\n                page += f\"{help_docstring}\\n\\n\"\n            # below is ugly\n            # page += f\"```text\\n{help_string}\\n```\\n\\n\"\n\n        # Create a mapping of command names to their full keys in helps\n        command_to_key = {}\n        for name in helps.keys():\n            names = name.split(\" \")\n            if len(names) == 2:  # e.g., \"lamin connect\"\n                command_name = names[1]\n                command_to_key[command_name] = name\n\n        # Group commands by their categories\n        command_groups = COMMAND_GROUPS.get(\"lamin\", [])\n        processed_commands = set()\n\n        for group in command_groups:\n  
          group_name = group[\"name\"]\n            group_commands = group[\"commands\"]\n\n            page += f\"## {group_name}\\n\\n\"\n\n            for command_name in group_commands:\n                if command_name in command_to_key:\n                    full_key = command_to_key[command_name]\n                    help_dict = helps[full_key]\n                    processed_commands.add(command_name)\n\n                    help_string = help_dict[\"help\"].replace(\"Usage: main\", \"lamin\")\n                    help_docstring = help_dict[\"docstring\"]\n\n                    pyr_alt_delimiter = \"→ Python/R alternative:\"\n\n                    if pyr_alt_delimiter in help_docstring:\n                        help_docstring, pyr_alt_string = help_docstring.split(\n                            pyr_alt_delimiter\n                        )\n                    else:\n                        pyr_alt_string = \"\"\n\n                    page += f\"### {command_name}\\n\\n\"\n                    if help_docstring:\n                        page += f\"{help_docstring}\\n\"\n                    command_block = f\"```text\\n{help_string}\\n```\"\n                    page += f\"\\n\\nOptions:\\n\\n{command_block}\\n\\n\"\n                    if pyr_alt_string:\n                        page += f\"{pyr_alt_delimiter}{pyr_alt_string}\\n\\n\"\n\n        # Add any remaining commands that aren't in groups\n        remaining_commands = []\n        for command_name, full_key in command_to_key.items():\n            if command_name not in processed_commands:\n                remaining_commands.append((command_name, full_key))\n\n        if remaining_commands:\n            page += \"## Other\\n\\n\"\n            for command_name, full_key in remaining_commands:\n                help_dict = helps[full_key]\n                help_string = help_dict[\"help\"].replace(\"Usage: main\", \"Usage: lamin\")\n                help_docstring = help_dict[\"docstring\"]\n\n                page += 
f\"### lamin {command_name}\\n\\n\"\n                if help_docstring:\n                    page += f\"{help_docstring}\\n\\n\"\n                page += f\"```text\\n{help_string}\\n```\\n\\n\"\n\n        Path(\"./docs/cli.md\").write_text(page)\n\n    generate_cli_docs()\n\n\n@nox.session\ndef docs(session):\n    # move artifacts into right place\n    run(session, \"lamin settings set private-django-api true\")\n    for group in [\"tutorial\", \"guide\", \"tiledbsoma\", \"biology\", \"faq\", \"storage\"]:\n        if Path(f\"./docs-{group}\").exists():\n            if Path(f\"./docs/{group}\").exists():\n                shutil.rmtree(f\"./docs/{group}\")\n            Path(f\"./docs-{group}\").rename(f\"./docs/{group}\")\n        # move back to root level\n        if group in {\"tutorial\", \"guide\", \"tiledbsoma\", \"biology\"}:\n            for path in Path(f\"./docs/{group}\").glob(\"*\"):\n                path.rename(f\"./docs/{path.name}\")\n    run(\n        session,\n        \"lamin init --storage ./docsbuild --modules bionty,pertdb\",\n    )\n    build_docs(session, strip_prefix=True, strict=False)\n    upload_docs_artifact()\n"
  },
  {
    "path": "pyproject.full.toml",
    "content": "[build-system]\nrequires = [\"flit_core >=3.2,<4\"]\nbuild-backend = \"flit_core.buildapi\"\n\n[project]\nname = \"lamindb\"\nrequires-python = \">=3.10,<=3.14\"\nauthors = [{name = \"Lamin Labs\", email = \"open-source@lamin.ai\"}]\nreadme = \"README.md\"\ndynamic = [\"version\", \"description\"]\nclassifiers = [\n    \"Programming Language :: Python :: 3.10\",\n    \"Programming Language :: Python :: 3.11\",\n    \"Programming Language :: Python :: 3.12\",\n    \"Programming Language :: Python :: 3.13\",\n]\ndependencies = [\n    \"lamindb-core[full]==2.4.2\",\n]\n\n[project.urls]\nHome = \"https://github.com/laminlabs/lamindb\"\n\n[project.optional-dependencies]\ngcp = [\n    \"lamindb_setup[gcp]\",\n]\nzarr_v2 = [\n    \"numcodecs<0.16.0\", # 0.16.0 breaks zarr<3.0.*\n    \"zarr>=2.16.0,<3.0.0a0\", # not yet compatible with 3.0.*\n]\nfcs = [\n    \"readfcs>=2.0.1\",\n]\ndev = [\n    # basic test\n    \"tomlkit\",\n    \"line_profiler\",\n    \"pre-commit\",\n    \"nox\",\n    \"laminci>=0.3\",\n    \"pytest>=6.0\",\n    \"coverage\",\n    \"pytest-cov<7.0.0\",  # v7 drops support for subprocess measurement\n    \"mudata\",\n    # others\n    \"nbproject_test>=0.6.0\",\n    # biology\n    \"faker-biology\",\n    # bionty\n    \"pronto\",\n]\n\n[tool.flit.module]\nname = \"lamindb_full\"\n\n[tool.flit.sdist]\nexclude = [\n    \"sub/\"\n]\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[build-system]\nrequires = [\"flit_core >=3.2,<4\"]\nbuild-backend = \"flit_core.buildapi\"\n\n[project]\nname = \"lamindb-core\"\nrequires-python = \">=3.10,<=3.14\"\nauthors = [{name = \"Lamin Labs\", email = \"open-source@lamin.ai\"}]\nreadme = \"README.md\"\ndynamic = [\"version\", \"description\"]\nclassifiers = [\n    \"Programming Language :: Python :: 3.10\",\n    \"Programming Language :: Python :: 3.11\",\n    \"Programming Language :: Python :: 3.12\",\n    \"Programming Language :: Python :: 3.13\",\n]\ndependencies = [\n    \"lamin_utils==0.16.4\",  # no dependencies\n    \"lamin_cli==1.16.0\",  # no dependencies\n    \"lamindb_setup[aws]==1.25a1\",  # dependencies like Django & fsspec\n    \"psycopg2-binary\",\n]\n\n[project.urls]\nHome = \"https://github.com/laminlabs/lamindb\"\n\n[project.optional-dependencies]\n# full: keep in sync with pyproject.full.toml dependencies (excluding lamindb-core).\n# If you change duplicated deps here, update pyproject.full.toml too, and vice versa.\nfull = [\n    # LaminDB optional modules, included to avoid users forgetting about extras\n    \"bionty>=2.3.1,<3\",  # 30kB pure python, no dependencies\n    \"pertdb>=2.2.0,<3\",  # 30kB pure python, no dependencies\n    # Jupyter -- small packages with few & small dependencies\n    \"jupytext\",\n    \"nbconvert>=7.2.1\",  # bound to avoid lxml[html_clean] dependency\n    \"nbproject==0.11.1\",  # adds orjson\n    # Data & validation dependencies (heavier)\n    \"pyarrow\",\n    \"pandera>=0.24.0\",\n    \"pandas>=2.0.0,<3.0.0\",  # for .infer_objects(copy=False) in lamin-utils; not yet compatible with Pandas 3.0.0\n    \"anndata>=0.10.0,<=0.12.10\",  # backed sparse is incompatible with scipy 1.15.0 for anndata 1.11.1\n    # Runtime utilities\n    \"graphviz\",\n    \"scipy<1.17.0\",  # 1.17.0 is incompatible with anndata<0.12.7\n    \"pyyaml\",\n    \"typing_extensions!=4.6.0\",\n    \"python-dateutil\",\n]\ngcp = [\n    
\"lamindb_setup[gcp]\",\n]\nzarr_v2 = [\n    \"numcodecs<0.16.0\", # 0.16.0 breaks zarr<3.0.*\n    \"zarr>=2.16.0,<3.0.0a0\", # not yet compatible with 3.0.*\n]\nfcs = [\n    \"readfcs>=2.0.1\",\n]\ndev = [\n    # basic test\n    \"tomlkit\",\n    \"line_profiler\",\n    \"pre-commit\",\n    \"nox\",\n    \"laminci>=0.3\",\n    \"pytest>=6.0\",\n    \"coverage\",\n    \"pytest-cov<7.0.0\",  # v7 drops support for subprocess measurement\n    \"mudata\",\n    # others\n    \"nbproject_test>=0.6.0\",\n    # biology\n    \"faker-biology\",\n    # bionty\n    \"pronto\",\n]\n\n[tool.flit.module]\nname = \"lamindb\"\n\n[tool.ruff]\nsrc = [\"src\"]\nline-length = 88\nlint.select = [\n    \"F\",  # Errors detected by Pyflakes\n    \"E\",  # Error detected by Pycodestyle\n    \"W\",  # Warning detected by Pycodestyle\n    \"I\",  # isort\n    \"D\",  # pydocstyle\n    \"B\",  # flake8-bugbear\n    \"TID\",  # flake8-tidy-imports\n    \"C4\",  # flake8-comprehensions\n    \"BLE\",  # flake8-blind-except\n    \"UP\",  # pyupgrade\n    \"RUF100\",  # Report unused noqa directives\n    \"TCH\",  # Typing imports\n    \"NPY\",  # Numpy specific rules\n    \"PTH\",  # Use pathlib\n    \"S\"  # Security\n]\nlint.ignore = [\n    # Do not catch blind exception: `Exception`\n    \"BLE001\",\n    # Errors from function calls in argument defaults. 
These are fine when the result is immutable.\n    \"B008\",\n    # line too long -> we accept long comment lines; black gets rid of long code lines\n    \"E501\",\n    # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient\n    \"E731\",\n    # allow I, O, l as variable names -> I is the identity matrix\n    \"E741\",\n    # Missing docstring in public module\n    \"D100\",\n    # undocumented-public-class\n    \"D101\",\n    # Missing docstring in public method\n    \"D102\",\n    # Missing docstring in public function\n    \"D103\",\n    # Missing docstring in public package\n    \"D104\",\n    # __magic__ methods are are often self-explanatory, allow missing docstrings\n    \"D105\",\n    # Missing docstring in public nested class\n    \"D106\",\n    # Missing docstring in __init__\n    \"D107\",\n    \"D405\",\n    \"D214\",\n    \"D416\",\n    ## Disable one in each pair of mutually incompatible rules\n    # We don’t want a blank line before a class docstring\n    \"D203\",\n    # 1 blank line required after class docstring\n    \"D204\",\n    # first line should end with a period [Bug: doesn't work with single-line docstrings]\n    # We want docstrings to start immediately after the opening triple quote\n    \"D213\",\n    # Section underline is over-indented (\"{name}\")\n    \"D215\",\n    # First line should be in imperative mood; try rephrasing\n    \"D401\",\n    # First word of the first line should be capitalized: {} -> {}\n    \"D403\",\n    # First word of the docstring should not be \"This\"\n    \"D404\",\n    # Section name should end with a newline (\"{name}\")\n    \"D406\",\n    # Missing dashed underline after section (\"{name}\")\n    \"D407\",\n    # Section underline should be in the line following the section's name (\"{name}\")\n    \"D408\",\n    # Section underline should match the length of its name (\"{name}\")\n    \"D409\",\n    # No blank lines allowed between a section header and its content 
(\"{name}\")\n    \"D412\",\n    # Missing blank line after last section (\"{name}\")\n    \"D413\",\n    # Missing argument description in the docstring\n    \"D417\",\n    # camcelcase imported as lowercase\n    \"N813\",\n    # module import not at top level of file\n    \"E402\",\n    # open()` should be replaced by `Path.open()\n    \"PTH123\",\n    # subprocess` call: check for execution of untrusted input - https://github.com/PyCQA/bandit/issues/333\n    \"S603\",\n    # Starting a process with a partial executable path\n    \"S607\",\n    # Prefer absolute imports over relative imports from parent modules\n    \"TID252\",\n    # Asserts\n    \"S101\",\n    # Standard pseudo-random generators are not suitable for cryptographic purposes\n    \"S311\",\n    # Starting a process with a shell: seems safe, but may be changed in the future; consider rewriting without `shell`\n    \"S605\",\n    # Possible SQL injection vector through string-based query construction\n    \"S608\",\n    # All of the below TODO 3.10 refactor, temporarily disable\n    \"S602\",\n    \"UP007\",\n    \"UP038\",\n    \"B905\",\n    \"UP035\",\n    \"RUF100\",\n]\n\n[tool.ruff.lint.pydocstyle]\nconvention = \"google\"\n\n[tool.ruff.lint.per-file-ignores]\n\"docs/*\" = [\"I\", \"S101\"]\n\"tests/**/*.py\" = [\n    \"D\",  # docstrings are allowed to look a bit off\n    \"S101\", # asserts allowed in tests...\n    \"ARG\", # Unused function args -> fixtures nevertheless are functionally relevant...\n    \"FBT\", # Don't care about booleans as positional arguments in tests, e.g. 
via @pytest.mark.parametrize()\n    \"PLR2004\", # Magic value used in comparison, ...\n    \"S311\", # Standard pseudo-random generators are not suitable for cryptographic purposes\n]\n\"tests/**/*.ipynb\" = [\"S101\"]\n\"*/__init__.py\" = [\"F401\"]\n\"lamindb/core/types.py\" = [\"F401\"]\n\n[tool.pytest.ini_options]\ntestpaths = [\n    \"tests\",\n]\nfilterwarnings = [\n    \"ignore::SyntaxWarning:pronto\",\n    \"ignore:::pronto.ontology\",\n    \"ignore::UserWarning:xarray_schema\",\n    \"ignore::DeprecationWarning:botocore.*\",\n    \"ignore::DeprecationWarning:xarray_schema\",\n    \"ignore::DeprecationWarning:geopandas\",\n    \"ignore::DeprecationWarning:tiledbsoma\",\n    \"ignore::DeprecationWarning:pkg_resources\",\n    \"ignore::FutureWarning:spatialdata\",\n    \"ignore::FutureWarning:mudata\",\n    \"ignore::UserWarning:anndata\",\n    \"ignore:Jupyter is migrating its paths to use standard platformdirs:DeprecationWarning\",\n    \"ignore:The 'train_dataloader' does not have many workers:UserWarning\",\n]\nmarkers = [\n    \"pg_integration: tests that require an external PostgreSQL instance\"\n]\n\n[tool.coverage.report]\nexclude_lines = [\n    \"if TYPE_CHECKING:\",\n    \"@abstractmethod\",\n    \"@abc.abstractmethod\"\n]\n\n[tool.coverage.run]\nomit = [\"**/examples/datasets/*\", \"**/migrations/*\", \"**/curators/_legacy.py\", \"**/core/_compat.py\", \"**/core/types.py\"]\n\n[tool.flit.sdist]\nexclude = [\n    \"sub/\"\n]\n"
  },
  {
    "path": "scripts/migrate_test_instances.py",
    "content": "#!/usr/bin/env python3\n\"\"\"Migrate all LaminDB instances used in lamindb tests.\n\nFor each instance: connect, run migrations, create storage snapshot.\nRun from repo root with: python scripts/migrate_test_instances.py\n\"\"\"\n\nimport subprocess\nimport sys\n\nINSTANCES = [\n    \"laminlabs/lamin-site-assets\",\n    \"laminlabs/lamin-dev\",\n    \"laminlabs/lamindata\",\n    \"laminlabs/cellxgene\",\n    \"laminlabs/bionty-assets\",\n    \"laminlabs/pertdata\",\n]\n\n\ndef run(cmd: str) -> None:\n    result = subprocess.run(cmd, shell=True)\n    if result.returncode != 0:\n        sys.exit(result.returncode)\n\n\ndef main() -> None:\n    for instance in INSTANCES:\n        print(f\"=== Migrating {instance} ===\")\n        run(f\"lamin connect {instance}\")\n        run(\"lamin migrate deploy\")\n        run(\"lamin io snapshot\")\n        print()\n\n    print(\"Done. All test instances migrated and snapshotted.\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "tests/core/_dataset_fixtures.py",
    "content": "from pathlib import Path\nfrom typing import Generator\n\nimport lamindb as ln\nimport numpy as np\nimport pandas as pd\nimport pytest\nfrom scipy.sparse import csr_matrix\n\n\n@pytest.fixture(scope=\"session\")\ndef get_small_adata():\n    # shouldn't need anndata installed to run tests\n    import anndata as ad\n\n    return ad.AnnData(\n        X=np.array([[1, 2, 3], [4, 5, 6]]),\n        obs={\"feat1\": [\"A\", \"B\"]},\n        var=pd.DataFrame(index=[\"MYC\", \"TCF7\", \"GATA1\"]),\n        obsm={\"X_pca\": np.array([[1, 2], [3, 4]])},\n    )\n\n\n@pytest.fixture(scope=\"session\")\ndef get_small_mdata():\n    # shouldn't need mudata installed to run tests\n    import anndata as ad\n    import mudata as md\n\n    adata1 = ad.AnnData(\n        X=np.array([[1, 2, 3], [4, 5, 6]]),\n        obs={\"feat1\": [\"A\", \"B\"]},\n        var=pd.DataFrame(index=[\"MYC\", \"TCF7\", \"GATA1\"]),\n        obsm={\"X_pca\": np.array([[1, 2], [3, 4]])},\n    )\n\n    adata2 = ad.AnnData(\n        X=np.array([[7, 8], [9, 10]]),\n        obs={\"feat2\": [\"C\", \"D\"]},\n        var=pd.DataFrame(index=[\"FOXP3\", \"CD8A\"]),\n        obsm={\"X_umap\": np.array([[5, 6], [7, 8]])},\n    )\n\n    return md.MuData({\"rna\": adata1, \"protein\": adata2})\n\n\n@pytest.fixture(scope=\"session\")\ndef get_small_sdata():\n    # shouldn't need spatialdata installed to run tests\n    import anndata as ad\n    import spatialdata as sd\n\n    adata = ad.AnnData(\n        X=csr_matrix(np.array([[0.1, 0.2], [0.3, 0.4]])),\n        obs=pd.DataFrame(index=[\"cell1\", \"cell2\"]),\n        var=pd.DataFrame(index=[\"gene1\", \"gene2\"]),\n    )\n\n    {\n        \"region1\": np.array([[[0, 0], [0, 1], [1, 1], [1, 0]]]),\n        \"region2\": np.array([[[2, 2], [2, 3], [3, 3], [3, 2]]]),\n    }\n\n    sdata_obj = sd.SpatialData(\n        tables={\"gene_expression\": adata},\n    )\n\n    return sdata_obj\n\n\n@pytest.fixture(scope=\"session\")\ndef get_mini_csv() -> Generator[Path, 
None, None]:\n    csv_path = ln.examples.datasets.file_mini_csv()\n    yield csv_path\n\n    Path(\"mini.csv\").unlink(missing_ok=True)\n"
  },
  {
    "path": "tests/core/conftest.py",
    "content": "import os\nimport shutil\nfrom pathlib import Path\nfrom subprocess import DEVNULL, run\nfrom time import perf_counter\n\nimport anndata as ad\nimport lamindb as ln\nimport lamindb_setup as ln_setup\nimport numpy as np\nimport pandas as pd\nimport pytest\n\n# for artifact fixtures\nimport yaml  # type: ignore\nfrom lamin_utils import logger\nfrom laminci.db import setup_local_test_postgres\n\n\ndef pytest_sessionstart():\n    t_execute_start = perf_counter()\n    ln_setup._TESTING = True\n    os.environ[\"LAMIN_TESTING\"] = \"true\"\n    is_postgresql = os.getenv(\"LAMINDB_TEST_DB_VENDOR\") == \"postgresql\"\n    if is_postgresql:\n        print(\"running tests on PostgreSQL\")\n    else:\n        os.environ[\"LAMINDB_TEST_DB_VENDOR\"] = \"sqlite\"\n        print(\"running tests on SQLite\")\n    if is_postgresql is False:\n        ln.setup.init(\n            storage=\"./default_storage_unit_core\",\n            modules=\"bionty\",\n            name=\"lamindb-unit-tests-core\",\n        )\n    else:\n        try:\n            pgurl = setup_local_test_postgres()\n        except RuntimeError:\n            run(\"docker stop pgtest && docker rm pgtest\", shell=True, stdout=DEVNULL)  # noqa: S602\n            pgurl = setup_local_test_postgres()\n        ln.setup.init(\n            storage=\"./default_storage_unit_core\",\n            modules=\"bionty\",\n            name=\"lamindb-unit-tests-core\",\n            db=pgurl,\n        )\n\n    ln.settings.creation.artifact_silence_missing_run_warning = True\n    total_time_elapsed = perf_counter() - t_execute_start\n    print(f\"time to setup the instance: {total_time_elapsed:.1f}s\")\n\n\ndef pytest_sessionfinish(session: pytest.Session):\n    logger.set_verbosity(1)\n    shutil.rmtree(\"./default_storage_unit_core\")\n    ln.setup.delete(\"lamindb-unit-tests-core\", force=True)\n    del os.environ[\"LAMIN_TESTING\"]\n    if not os.getenv(\"LAMINDB_TEST_DB_VENDOR\") == \"sqlite\":\n        run(\"docker stop 
pgtest && docker rm pgtest\", shell=True, stdout=DEVNULL)  # noqa: S602\n\n\n@pytest.fixture\ndef ccaplog(caplog) -> pytest.LogCaptureFixture:\n    \"\"\"Add caplog handler to our custom logger at session start.\"\"\"\n    from lamin_utils._logger import logger\n\n    logger.addHandler(caplog.handler)\n\n    yield caplog\n\n    logger.removeHandler(caplog.handler)\n\n\n@pytest.fixture(\n    scope=\"function\",\n    params=[\n        # tuple of is_in_registered_storage, path, suffix, hash of test_dir\n        (True, \"./default_storage_unit_core/\", \".csv\", \"iGtHiFEBV3r1_TFovdQCgw\"),\n        (True, \"./default_storage_unit_core/\", \"\", \"iGtHiFEBV3r1_TFovdQCgw\"),\n        (True, \"./registered_storage/\", \".csv\", \"iGtHiFEBV3r1_TFovdQCgw\"),\n        (True, \"./registered_storage/\", \"\", \"iGtHiFEBV3r1_TFovdQCgw\"),\n        (False, \"./nonregistered_storage/\", \".csv\", \"iGtHiFEBV3r1_TFovdQCgw\"),\n        (False, \"./nonregistered_storage/\", \"\", \"iGtHiFEBV3r1_TFovdQCgw\"),\n    ],\n)\ndef get_test_filepaths(request):  # -> Tuple[bool, Path, Path, Path, str]\n    is_in_registered_storage: bool = request.param[0]\n    root_dir: Path = Path(request.param[1])\n    suffix: str = request.param[2]\n    hash_test_dir: str = request.param[3]\n    if is_in_registered_storage:\n        # ensure that it's actually registered\n        if ln.Storage.filter(root=root_dir.resolve().as_posix()).one_or_none() is None:\n            ln.Storage(root=root_dir.resolve().as_posix(), type=\"local\").save()\n    else:\n        assert (\n            ln.Storage.filter(root=root_dir.resolve().as_posix()).one_or_none() is None\n        )\n    test_dirpath = root_dir / \"my_dir/\"\n    test_dirpath.mkdir(parents=True, exist_ok=True)\n    # create a first file\n    test_filepath0 = test_dirpath / f\"my_file{suffix}\"\n    test_filepath0.write_text(\"0\")\n    # create a second, duplicated file\n    test_filepath1 = test_dirpath / f\"my_file1{suffix}\"\n    
test_filepath1.write_text(\"0\")\n    # create a non-duplicated file\n    test_filepath2 = test_dirpath / f\"my_file2{suffix}\"\n    test_filepath2.write_text(\"1\")\n    # return a boolean indicating whether test filepath is in default storage\n    # and the test filepath\n    yield (\n        is_in_registered_storage,\n        root_dir,\n        test_dirpath,\n        test_filepath0,\n        suffix,\n        hash_test_dir,\n    )\n    shutil.rmtree(test_dirpath)\n\n\n@pytest.fixture(scope=\"function\")\ndef registered_storage_file_and_folder():\n    root_dir = Path(\"./registered_storage_suffix_fixture\")\n    storage_root = root_dir.resolve().as_posix()\n    if ln.Storage.filter(root=storage_root).one_or_none() is None:\n        ln.Storage(root=storage_root, type=\"local\").save()\n\n    test_dirpath = root_dir / \"suffix_fixture_dir\"\n    test_dirpath.mkdir(parents=True, exist_ok=True)\n    test_filepath = test_dirpath / \"suffix_fixture_file.csv\"\n    test_filepath.write_text(\"a,b\\n1,2\\n\")\n\n    folder_path = root_dir / \"suffix_fixture_folder\"\n    folder_path.mkdir(parents=True, exist_ok=True)\n    (folder_path / \"nested.txt\").write_text(\"content\")\n\n    yield test_filepath, folder_path\n\n    shutil.rmtree(test_dirpath, ignore_errors=True)\n    shutil.rmtree(folder_path, ignore_errors=True)\n\n\n@pytest.fixture(scope=\"session\")\ndef example_dataframe():\n    return pd.DataFrame({\"feat1\": [1, 2], \"feat2\": [3, 4]})\n\n\n@pytest.fixture(scope=\"session\")\ndef adata_file():\n    adata = ad.AnnData(\n        X=np.array([[1, 2, 3], [4, 5, 6]]),\n        obs={\"feat1\": [\"A\", \"B\"]},\n        var=pd.DataFrame(index=[\"MYC\", \"TCF7\", \"GATA1\"]),\n        obsm={\"X_pca\": np.array([[1, 2], [3, 4]])},\n    )\n    filepath = Path(\"adata_file.h5ad\")\n    adata.write(filepath)\n    yield \"adata_file.h5ad\"\n    filepath.unlink()\n\n\n@pytest.fixture(scope=\"session\")\ndef tsv_file():\n    filepath = Path(\"test.tsv\")\n    pd.DataFrame([1, 
2]).to_csv(filepath, sep=\"\\t\")\n    yield filepath\n    filepath.unlink()\n\n\n@pytest.fixture(scope=\"session\")\ndef zip_file():\n    filepath = Path(\"test.zip\")\n    pd.DataFrame([1, 2]).to_csv(filepath, sep=\"\\t\")\n    yield filepath\n    filepath.unlink(missing_ok=True)\n\n\n@pytest.fixture(scope=\"session\")\ndef yaml_file():\n    filepath = Path(\"test.yaml\")\n    dct = {\"a\": 1, \"b\": 2}\n    with open(filepath, \"w\") as f:\n        yaml.dump(dct, f)\n    yield filepath\n    filepath.unlink()\n\n\n@pytest.fixture(scope=\"session\")\ndef fcs_file():\n    fcs_path = ln.examples.datasets.file_fcs_alpert19()\n    yield fcs_path\n    fcs_path.unlink()\n\n\n@pytest.fixture(scope=\"session\")\ndef mudata_file(get_small_mdata):\n    filepath = Path(\"test.h5mu\")\n    get_small_mdata.write(filepath)\n    yield filepath\n    filepath.unlink()\n\n\n@pytest.fixture(scope=\"session\")\ndef spatialdata_file(get_small_sdata):\n    filepath = Path(\"test.zarr\")\n    get_small_sdata.write(filepath)\n    yield filepath\n    shutil.rmtree(filepath)\n"
  },
  {
    "path": "tests/core/notebooks/basic-r-notebook.Rmd.cleaned.html",
    "content": "<!doctype html>\n<html>\n  <meta charset=\"utf-8\" />\n  \n  \n\n  <!-- rnb-text-begin -->\n  <!-- rnb-text-end -->\n  <!-- rnb-chunk-begin -->\n  <!-- rnb-output-begin eyJkYXRhIjoiXG48IS0tIHJuYi1zb3VyY2UtYmVnaW4gZXlKa1lYUmhJam9pWUdCZ2NseHViR2xpY21GeWVTaHNZVzFwYm5JcFhHNWNibVJpSUR3dElHTnZibTVsWTNRb0tWeHVZR0JnSW4wPSAtLT5cblxuYGBgclxubGlicmFyeShsYW1pbnIpXG5cbmRiIDwtIGNvbm5lY3QoKVxuYGBgXG5cbjwhLS0gcm5iLXNvdXJjZS1lbmQgLS0+XG4ifQ== -->\n  <!-- rnb-source-begin eyJkYXRhIjoiYGBgclxubGlicmFyeShsYW1pbnIpXG5cbmRiIDwtIGNvbm5lY3QoKVxuYGBgIn0= -->\n  <pre class=\"r\"><code>library(laminr)\n\ndb &lt;- connect()</code></pre>\n  <!-- rnb-source-end -->\n  <!-- rnb-output-end -->\n  <!-- rnb-output-begin eyJkYXRhIjoi4oaSIGNvbm5lY3RlZCBsYW1pbmRiOiBsYW1pbmxhYnMvbGFtaW5kYXRhXG4ifQ== -->\n  <pre><code>→ connected lamindb: laminlabs/lamindata</code></pre>\n  <!-- rnb-output-end -->\n  <!-- rnb-output-begin eyJkYXRhIjoiXG48IS0tIHJuYi1zb3VyY2UtYmVnaW4gZXlKa1lYUmhJam9pWUdCZ2NseHVaR0lrZEhKaFkyc29YQ0pzVDFOamRYaEVWRVJGTUhFd01EQXdYQ0lwWEc1Z1lHQWlmUT09IC0tPlxuXG5gYGByXG5kYiR0cmFjayhcImxPU2N1eERUREUwcTAwMDBcIilcbmBgYFxuXG48IS0tIHJuYi1zb3VyY2UtZW5kIC0tPlxuIn0= -->\n  <!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuZGIkdHJhY2soXCJsT1NjdXhEVERFMHEwMDAwXCIpXG5gYGAifQ== -->\n  <pre class=\"r\"><code>db$track(&quot;lOScuxDTDE0q0000&quot;)</code></pre>\n  <!-- rnb-source-end -->\n  <!-- rnb-output-end -->\n  <!-- rnb-output-begin eyJkYXRhIjoi4oaSIGxvYWRlZCBUcmFuc2Zvcm0oJ2xPU2N1eERUJyksIHN0YXJ0ZWQgUnVuKCdHV3BhVHRVZycpIGF0IDIwMjQtMTItMDEgMTc6NDk6MTggVVRDXG4ifQ== -->\n  <pre><code>→ loaded Transform(&#39;lOScuxDT&#39;), started Run(&#39;GWpaTtUg&#39;) at 2024-12-01 17:49:18 UTC</code></pre>\n  <!-- rnb-output-end -->\n  <!-- rnb-chunk-end -->\n  <!-- rnb-text-begin -->\n  <!-- rnb-text-end -->\n  <!-- rnb-chunk-begin -->\n  <!-- rnb-output-begin 
eyJkYXRhIjoiXG48IS0tIHJuYi1zb3VyY2UtYmVnaW4gZXlKa1lYUmhJam9pWUdCZ2NseHVaR0lrWm1sdWFYTm9LQ2xjYm1CZ1lDSjkgLS0+XG5cbmBgYHJcbmRiJGZpbmlzaCgpXG5gYGBcblxuPCEtLSBybmItc291cmNlLWVuZCAtLT5cbiJ9 -->\n  <!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuZGIkZmluaXNoKClcbmBgYCJ9 -->\n  <pre class=\"r\"><code>db$finish()</code></pre>\n  <!-- rnb-source-end -->\n  <!-- rnb-output-end -->\n  <!-- rnb-output-begin eyJkYXRhIjoiRXJyb3IgaW4gcHlfY2FsbF9pbXBsKGNhbGxhYmxlLCBjYWxsX2FyZ3MkdW5uYW1lZCwgY2FsbF9hcmdzJG5hbWVkKSA6IFxuICBsYW1pbmRiLmNvcmUuZXhjZXB0aW9ucy5Ob3RlYm9va05vdFNhdmVkOiBQbGVhc2Ugc2F2ZSB0aGUgbm90ZWJvb2sgaW4gUlN0dWRpbyAoc2hvcnRjdXQgYENNRCArIHNgKSB3aXRoaW4gMiBzZWMgYmVmb3JlIGNhbGxpbmcgYGRiJGZpbmlzaCgpYFxuUnVuIFx1MDAxYl04Oztyc3R1ZGlvOnJ1bjpyZXRpY3VsYXRlOjpweV9sYXN0X2Vycm9yKClcdTAwMDdgcmV0aWN1bGF0ZTo6cHlfbGFzdF9lcnJvcigpYFx1MDAxYl04OztcdTAwMDcgZm9yIGRldGFpbHMuXG4ifQ== -->\n  <pre><code>MoreOUTPUT </code></pre>\n  <!-- rnb-output-end -->\n  <!-- rnb-chunk-end -->\n  <!-- rnb-text-begin -->\n</html>\n"
  },
  {
    "path": "tests/core/notebooks/basic-r-notebook.Rmd.html",
    "content": "<!doctype html>\n<html>\n  <meta charset=\"utf-8\" />\n  <title>My exemplary R analysis</title>\n  <h1 class=\"title toc-ignore\">My exemplary R analysis</h1>\n\n  <!-- rnb-text-begin -->\n  <!-- rnb-text-end -->\n  <!-- rnb-chunk-begin -->\n  <!-- rnb-output-begin eyJkYXRhIjoiXG48IS0tIHJuYi1zb3VyY2UtYmVnaW4gZXlKa1lYUmhJam9pWUdCZ2NseHViR2xpY21GeWVTaHNZVzFwYm5JcFhHNWNibVJpSUR3dElHTnZibTVsWTNRb0tWeHVZR0JnSW4wPSAtLT5cblxuYGBgclxubGlicmFyeShsYW1pbnIpXG5cbmRiIDwtIGNvbm5lY3QoKVxuYGBgXG5cbjwhLS0gcm5iLXNvdXJjZS1lbmQgLS0+XG4ifQ== -->\n  <!-- rnb-source-begin eyJkYXRhIjoiYGBgclxubGlicmFyeShsYW1pbnIpXG5cbmRiIDwtIGNvbm5lY3QoKVxuYGBgIn0= -->\n  <pre class=\"r\"><code>library(laminr)\n\ndb &lt;- connect()</code></pre>\n  <!-- rnb-source-end -->\n  <!-- rnb-output-end -->\n  <!-- rnb-output-begin eyJkYXRhIjoi4oaSIGNvbm5lY3RlZCBsYW1pbmRiOiBsYW1pbmxhYnMvbGFtaW5kYXRhXG4ifQ== -->\n  <pre><code>→ connected lamindb: laminlabs/lamindata</code></pre>\n  <!-- rnb-output-end -->\n  <!-- rnb-output-begin eyJkYXRhIjoiXG48IS0tIHJuYi1zb3VyY2UtYmVnaW4gZXlKa1lYUmhJam9pWUdCZ2NseHVaR0lrZEhKaFkyc29YQ0pzVDFOamRYaEVWRVJGTUhFd01EQXdYQ0lwWEc1Z1lHQWlmUT09IC0tPlxuXG5gYGByXG5kYiR0cmFjayhcImxPU2N1eERUREUwcTAwMDBcIilcbmBgYFxuXG48IS0tIHJuYi1zb3VyY2UtZW5kIC0tPlxuIn0= -->\n  <!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuZGIkdHJhY2soXCJsT1NjdXhEVERFMHEwMDAwXCIpXG5gYGAifQ== -->\n  <pre class=\"r\"><code>db$track(&quot;lOScuxDTDE0q0000&quot;)</code></pre>\n  <!-- rnb-source-end -->\n  <!-- rnb-output-end -->\n  <!-- rnb-output-begin eyJkYXRhIjoi4oaSIGxvYWRlZCBUcmFuc2Zvcm0oJ2xPU2N1eERUJyksIHN0YXJ0ZWQgUnVuKCdHV3BhVHRVZycpIGF0IDIwMjQtMTItMDEgMTc6NDk6MTggVVRDXG4ifQ== -->\n  <pre><code>→ loaded Transform(&#39;lOScuxDT&#39;), started Run(&#39;GWpaTtUg&#39;) at 2024-12-01 17:49:18 UTC</code></pre>\n  <!-- rnb-output-end -->\n  <!-- rnb-chunk-end -->\n  <!-- rnb-text-begin -->\n  <!-- rnb-text-end -->\n  <!-- rnb-chunk-begin -->\n  <!-- rnb-output-begin 
eyJkYXRhIjoiXG48IS0tIHJuYi1zb3VyY2UtYmVnaW4gZXlKa1lYUmhJam9pWUdCZ2NseHVaR0lrWm1sdWFYTm9LQ2xjYm1CZ1lDSjkgLS0+XG5cbmBgYHJcbmRiJGZpbmlzaCgpXG5gYGBcblxuPCEtLSBybmItc291cmNlLWVuZCAtLT5cbiJ9 -->\n  <!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuZGIkZmluaXNoKClcbmBgYCJ9 -->\n  <pre class=\"r\"><code>db$finish()</code></pre>\n  <!-- rnb-source-end -->\n  <!-- rnb-output-end -->\n  <!-- rnb-output-begin eyJkYXRhIjoiRXJyb3IgaW4gcHlfY2FsbF9pbXBsKGNhbGxhYmxlLCBjYWxsX2FyZ3MkdW5uYW1lZCwgY2FsbF9hcmdzJG5hbWVkKSA6IFxuICBsYW1pbmRiLmNvcmUuZXhjZXB0aW9ucy5Ob3RlYm9va05vdFNhdmVkOiBQbGVhc2Ugc2F2ZSB0aGUgbm90ZWJvb2sgaW4gUlN0dWRpbyAoc2hvcnRjdXQgYENNRCArIHNgKSB3aXRoaW4gMiBzZWMgYmVmb3JlIGNhbGxpbmcgYGRiJGZpbmlzaCgpYFxuUnVuIFx1MDAxYl04Oztyc3R1ZGlvOnJ1bjpyZXRpY3VsYXRlOjpweV9sYXN0X2Vycm9yKClcdTAwMDdgcmV0aWN1bGF0ZTo6cHlfbGFzdF9lcnJvcigpYFx1MDAxYl04OztcdTAwMDcgZm9yIGRldGFpbHMuXG4ifQ== -->\n  <pre><code>MoreOUTPUT ! please hit SHORTCUT to save the notebook in your editor and re-run finish()</code></pre>\n  <!-- rnb-output-end -->\n  <!-- rnb-chunk-end -->\n  <!-- rnb-text-begin -->\n</html>\n"
  },
  {
    "path": "tests/core/notebooks/duplicate/with-title-initialized-consecutive-finish.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# My duplicated test notebook (consecutive) with `ln.finish()`\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"This has actually different content than the original one in the `notebooks/` folder.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import lamindb as ln\\n\",\n    \"\\n\",\n    \"ln.track()\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"py310\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.12.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "tests/core/notebooks/load_schema.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import lamindb as ln\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"1\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# this is a test case because we had an issue with path resolution at some point: https://github.com/laminlabs/lamindb/pull/3211\\n\",\n    \"valid_features = ln.examples.schemas.valid_features()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"2\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"valid_features.delete(permanent=True)\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.12.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "tests/core/notebooks/no-title.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"A notebook without title.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"1\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import lamindb as ln\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"2\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# pass stem uid\\n\",\n    \"ln.track(\\\"123456789ABC\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"3\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"assert ln.context.transform.description is None\\n\",\n    \"assert ln.context.transform.key == \\\"no-title.ipynb\\\"\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"py312\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.12.8\"\n  },\n  \"nbproject\": {\n   \"id\": \"Irn3xQyQ40GU\",\n   \"pypackage\": {\n    \"nbproject\": \"0.0.7+2.g8521e30\"\n   },\n   \"time_init\": \"2022-06-08T14:42:31.551211+00:00\",\n   \"version\": \"0\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "tests/core/notebooks/with-title-initialized-consecutive-finish-not-last-cell.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# My test notebook (consecutive) with `ln.finish()` not in last cell\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import lamindb as ln\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# do not pass uid purposefully\\n\",\n    \"ln.track()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(\\\"my consecutive cell\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ln.finish(ignore_non_consecutive=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(\\\"my consecutive cell\\\")\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"py39\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.12.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "tests/core/notebooks/with-title-initialized-consecutive-finish.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# My test notebook (consecutive) with `ln.finish()`\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import lamindb as ln\\n\",\n    \"import pytest\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"with pytest.raises(ln.errors.InvalidArgument) as error:\\n\",\n    \"    ln.track(\\\"ujPaFZ\\\")\\n\",\n    \"print(error.exconly())\\n\",\n    \"assert error.exconly().startswith(\\n\",\n    \"    'lamindb.errors.InvalidArgument: Please pass an auto-generated uid instead of \\\"ujPaFZ\\\". Resolve by running:'\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# with uid passed\\n\",\n    \"ln.track(\\\"ujPaFZatnMLG0000\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(\\\"my consecutive cell\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"print(\\\"my consecutive cell\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ln.finish()\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"py312\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": 
\"ipython3\",\n   \"version\": \"3.12.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "tests/core/scripts/duplicate1/script-to-test-versioning.py",
    "content": "import lamindb as ln\n\nln.context.version = \"1\"\nln.track(\"Ro1gl7n8YrdH0001\")\n"
  },
  {
    "path": "tests/core/scripts/duplicate2/script-to-test-versioning.py",
    "content": "import lamindb as ln\n\nln.context.version = \"2\"\nln.track(\"Ro1gl7n8YrdH0002\")\n\nassert ln.context.transform.version_tag == \"2\"\n"
  },
  {
    "path": "tests/core/scripts/duplicate3/script-to-test-versioning.py",
    "content": "import lamindb as ln\n\nln.context.version = \"3\"\nln.track(\"Ro1gl7n8YrdH0002\")\n"
  },
  {
    "path": "tests/core/scripts/duplicate4/script-to-test-versioning.py",
    "content": "import lamindb as ln\n\nln.track()\n"
  },
  {
    "path": "tests/core/scripts/duplicate5/script-to-test-versioning.py",
    "content": "import lamindb as ln\n\n# different from the one in duplicate4\nln.track()\n\nln.finish()\n"
  },
  {
    "path": "tests/core/scripts/script-to-test-filename-change.py",
    "content": "import lamindb as ln\n\nln.track(\"Ro1gl7n8YrdH0001\")\n"
  },
  {
    "path": "tests/core/scripts/script-to-test-versioning.py",
    "content": "import lamindb as ln\n\nln.context.version = \"1\"\nln.track(\"Ro1gl7n8YrdH0000\")\n"
  },
  {
    "path": "tests/core/test_artifact_anndata_with_curation.py",
    "content": "import lamindb as ln\n\n\ndef test_create_anndata_with_curation():\n    adata = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"AnnData\")\n    feature1 = ln.Feature(name=\"sample_note\", dtype=str).save()\n\n    # ingest the first time\n    artifact = ln.Artifact.from_anndata(\n        adata,\n        key=\"examples/mini_immuno1.h5ad\",\n        schema=\"ensembl_gene_ids_and_valid_features_in_obs\",\n    ).save()\n    # capture the obs_schema because we'll overwrite it\n    obs_schema = artifact.features.slots[\"obs\"]\n\n    # define another feature so that upon re-ingestion, we track more than before\n    # (this also tests non-trivial idempotency)\n    feature2 = ln.Feature(name=\"treatment_time_h\", dtype=int).save()\n    artifact = ln.Artifact.from_anndata(\n        adata,\n        key=\"examples/mini_immuno1.h5ad\",\n        schema=\"ensembl_gene_ids_and_valid_features_in_obs\",\n    ).save()\n\n    schemas = artifact.features.slots\n    artifact.delete(permanent=True)\n    for schema in schemas.values():\n        schema.delete(permanent=True)\n    obs_schema.delete(permanent=True)\n    feature1.delete(permanent=True)\n    feature2.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_artifact_basics.py",
    "content": "\"\"\"Artifact tests.\n\nAlso see `test_artifact_folders.py` for tests of folder-like artifacts.\n\"\"\"\n\n# ruff: noqa: F811\n\nimport shutil\nimport sys\nfrom pathlib import Path, PurePosixPath\nfrom types import ModuleType, SimpleNamespace\nfrom unittest.mock import patch\n\nimport anndata as ad\nimport h5py\nimport lamindb as ln\nimport lamindb_setup\nimport mudata as md\nimport pandas as pd\nimport pytest\nimport zarr\nfrom _dataset_fixtures import (  # noqa\n    get_mini_csv,\n    get_small_adata,\n    get_small_mdata,\n    get_small_sdata,\n)\nfrom lamindb.core.loaders import load_fcs, load_to_memory, load_tsv\nfrom lamindb.core.storage.paths import (\n    AUTO_KEY_PREFIX,\n    auto_storage_key_from_artifact_uid,\n    check_path_is_child_of_root,\n    delete_storage,\n)\nfrom lamindb.errors import (\n    FieldValidationError,\n    InvalidArgument,\n)\nfrom lamindb.models.artifact import (\n    data_is_scversedatastructure,\n    get_relative_path_to_directory,\n    process_data,\n)\nfrom lamindb_setup.core.upath import (\n    CloudPath,\n    LocalPathClasses,\n    UPath,\n    extract_suffix_from_path,\n)\n\n# how do we properly abstract out the default storage variable?\n# currently, we're only mocking it through `storage` as set in conftest.py\n\nln.settings.verbosity = \"success\"\n\n\n@pytest.fixture\ndef data(request):\n    if request.param == \"get_small_adata\":\n        return request.getfixturevalue(\"get_small_adata\")\n    else:\n        return request.param\n\n\n# -------------------------------------------------------------------------------------\n# Basic construction\n# -------------------------------------------------------------------------------------\n\n\ndef test_basic_validation():\n    # extra kwargs\n    with pytest.raises(FieldValidationError):\n        ln.Artifact(\"testpath.csv\", description=\"test1b\", extra_kwarg=\"extra\")\n\n    # > 1 args\n    with pytest.raises(ValueError) as error:\n        
ln.Artifact(\"testpath.csv\", \"testpath.csv\")\n    assert error.exconly() == \"ValueError: Only one non-keyword arg allowed: path\"\n\n    # AUTO_KEY_PREFIX in key\n    with pytest.raises(ValueError) as error:\n        ln.Artifact(\".gitignore\", key=\".lamindb/test_df.parquet\")\n    assert (\n        error.exconly()\n        == f\"ValueError: Do not pass key that contains a managed storage path in `{AUTO_KEY_PREFIX}`\"\n    )\n\n    # path that contains AUTO_KEY_PREFIX\n    with pytest.raises(ValueError) as error:\n        ln.Artifact(\".lamindb/test_df.parquet\", description=\"Test\")\n    assert (\n        error.exconly()\n        == f\"ValueError: Do not pass path inside the `{AUTO_KEY_PREFIX}` directory.\"\n    )\n\n\n@pytest.mark.parametrize(\"key_is_virtual\", [True, False])\n@pytest.mark.parametrize(\"key\", [None, \"my_new_dir/my_artifact.csv\", \"nosuffix\"])\n@pytest.mark.parametrize(\"description\", [None, \"my description\"])\ndef test_create_from_path_file(get_test_filepaths, key_is_virtual, key, description):\n    ln.settings.creation._artifact_use_virtual_keys = key_is_virtual\n    is_in_registered_storage = get_test_filepaths[0]\n    root_dir = get_test_filepaths[1]\n    test_filepath = get_test_filepaths[3]\n    suffix = get_test_filepaths[4]  # path suffix\n    if key is not None:\n        key_suffix = extract_suffix_from_path(\n            PurePosixPath(key), arg_name=\"key\"\n        )  # key suffix\n    else:\n        key_suffix = None\n    # this tests if insufficient information is being provided\n    if key is None and not is_in_registered_storage and description is None:\n        # this can fail because ln.track() might set a global run context\n        # in that case, the Artifact would have a run that's not None and the\n        # error below wouldn't be thrown\n        with pytest.raises(ValueError) as error:\n            artifact = ln.Artifact(test_filepath, key=key, description=description)\n        assert (\n            
error.exconly()\n            == \"ValueError: Pass one of key, run or description as a parameter\"\n        )\n        return None\n    elif key is not None and suffix != key_suffix:\n        with pytest.raises(InvalidArgument) as error:\n            artifact = ln.Artifact(test_filepath, key=key, description=description)\n        assert error.exconly() == (\n            f\"lamindb.errors.InvalidArgument: The passed path's suffix '{suffix}' must match the passed key's suffix '{key_suffix}'.\"\n        )\n        return None\n    elif key is not None and is_in_registered_storage:\n        inferred_key = get_relative_path_to_directory(\n            path=test_filepath, directory=root_dir\n        ).as_posix()\n        try:\n            artifact = ln.Artifact(test_filepath, key=key, description=description)\n        except InvalidArgument as error:\n            assert str(error) == (\n                f\"The path '{test_filepath}' is already in registered storage\"\n                f\" '{root_dir.resolve().as_posix()}' with key '{inferred_key}'\\nYou\"\n                f\" passed conflicting key '{key}': please move the file before\"\n                \" registering it.\"\n            )\n        return None\n    else:\n        artifact = ln.Artifact(test_filepath, key=key, description=description)\n        assert artifact._state.adding  # make sure that this is a new file in the db\n    assert (\n        artifact.description is None\n        if description is None\n        else artifact.description == description\n    )\n    assert artifact.suffix == suffix\n    assert artifact.n_files is None\n    artifact.save()\n    assert artifact.path.exists()\n    # check get by path\n    assert ln.Artifact.get(path=artifact.path) == artifact\n\n    if key is None:\n        assert (\n            artifact.key == f\"my_dir/my_file{suffix}\"\n            if is_in_registered_storage\n            else artifact.key is None\n        )\n        if is_in_registered_storage:\n            
assert artifact.storage.root == root_dir.resolve().as_posix()\n            assert artifact.path == test_filepath.resolve()\n        else:\n            assert artifact.storage.root == lamindb_setup.settings.storage.root_as_str\n            assert (\n                artifact.path\n                == lamindb_setup.settings.storage.root\n                / f\".lamindb/{artifact.uid}{suffix}\"\n            )\n    else:\n        assert artifact.key == key\n        assert artifact._key_is_virtual == key_is_virtual\n        if is_in_registered_storage:\n            # this would only hit if the key matches the correct key\n            assert artifact.storage.root == root_dir.resolve().as_posix()\n            assert (\n                artifact.path == root_dir / f\"{key}{suffix}\" == test_filepath.resolve()\n            )\n        else:\n            # file is moved into default storage\n            if key_is_virtual:\n                assert (\n                    artifact.path\n                    == lamindb_setup.settings.storage.root\n                    / f\".lamindb/{artifact.uid}{suffix}\"\n                )\n            else:\n                assert artifact.path == lamindb_setup.settings.storage.root / key\n    # only delete from storage if a file copy took place\n    delete_from_storage = str(test_filepath.resolve()) != str(artifact.path)\n    artifact.delete(permanent=True, storage=delete_from_storage)\n    ln.settings.creation._artifact_use_virtual_keys = True\n\n\n@pytest.mark.parametrize(\"key_is_virtual\", [True, False])\n@pytest.mark.parametrize(\"key\", [None, \"my_new_file.tsv\"])\ndef test_create_from_path_file_with_explicit_key_is_virtual(\n    tsv_file, key_is_virtual, key\n):\n    artifact = ln.Artifact(\n        tsv_file,\n        description=\"test explicit key is virtual\",\n        key=key,\n        _key_is_virtual=key_is_virtual,\n    )\n    assert artifact.key == key\n    assert artifact._key_is_virtual == key_is_virtual\n    artifact.save()\n    
assert artifact.path.exists()\n\n    root = lamindb_setup.settings.storage.root\n    if not key_is_virtual and key is not None:\n        assert artifact.path == root / key\n    else:\n        assert artifact.path == root / f\".lamindb/{artifact.uid}.tsv\"\n\n    artifact.delete(permanent=True, storage=True)\n\n\ndef test_create_from_empty_files_skips_hash_lookup(tmp_path):\n    path_1 = tmp_path / \"empty-1.txt\"\n    path_2 = tmp_path / \"empty-2.txt\"\n    path_1.write_text(\"\")\n    path_2.write_text(\"\")\n\n    artifact_1 = ln.Artifact(path_1, key=f\"{tmp_path.name}/empty-1.txt\").save()\n    artifact_2 = ln.Artifact(path_2, key=f\"{tmp_path.name}/empty-2.txt\")\n\n    assert artifact_2.uid != artifact_1.uid\n    assert artifact_2.key == f\"{tmp_path.name}/empty-2.txt\"\n    assert artifact_2.hash == artifact_1.hash\n\n    artifact_2.save()\n    assert artifact_2.id != artifact_1.id\n\n    artifact_2.delete(permanent=True)\n    artifact_1.delete(permanent=True)\n\n\n@pytest.mark.parametrize(\"key\", [None, \"my_new_folder\"])\ndef test_create_from_path_folder(get_test_filepaths, key):\n    # get variables from fixture\n    is_in_registered_storage = get_test_filepaths[0]\n    test_dirpath = get_test_filepaths[2]\n    hash_test_dir = get_test_filepaths[5]\n    if key is None and not is_in_registered_storage:\n        with pytest.raises(ValueError) as error:\n            ln.Artifact(test_dirpath, key=key)\n        assert error.exconly().startswith(\n            \"ValueError: Pass one of key, run or description as a parameter\"\n        )\n        return None\n    artifact1 = ln.Artifact(test_dirpath, key=key)\n    if key is not None and is_in_registered_storage:\n        assert artifact1._real_key is not None\n        # should fail because we are passing a path in an existing storage with a virtual key\n        with pytest.raises(ValueError) as error:\n            ln.Artifact(test_dirpath, key=key, _key_is_virtual=False)\n        assert 
error.exconly().startswith(\n            \"ValueError: Passing a path in an existing storage with a virtual key and _key_is_virtual=False is incompatible.\"\n        )\n    else:\n        assert artifact1._real_key is None\n    # check that passing _key_is_virtual=True is incompatible with a path in an existing storage without a virtual key\n    if key is None and is_in_registered_storage:\n        with pytest.raises(ValueError) as error:\n            ln.Artifact(test_dirpath, key=key, _key_is_virtual=True)\n        assert error.exconly().startswith(\n            \"ValueError: Passing a path in an existing storage without a virtual key and _key_is_virtual=True is incompatible.\"\n        )\n    assert artifact1.n_files == 3\n    assert artifact1.hash == hash_test_dir\n    assert artifact1._state.adding\n    assert artifact1.description is None\n    assert artifact1.path.exists()\n    artifact1.save()\n\n    # run tests on re-creating the Artifact\n    artifact2 = ln.Artifact(test_dirpath, key=key, description=\"something\")\n    assert not artifact2._state.adding\n    assert artifact1.id == artifact2.id\n    assert artifact1.uid == artifact2.uid\n    assert artifact1.storage == artifact2.storage\n    assert artifact2.path.exists()\n    assert artifact2.description == \"something\"\n\n    # now put another file in the test directory\n\n    # create a first file\n    test_filepath_added = test_dirpath / \"my_file_added.txt\"\n    test_filepath_added.write_text(\"2\")\n    artifact3 = ln.Artifact(test_dirpath, key=key, revises=artifact1)\n    assert artifact3.n_files == 4\n    assert artifact3.hash != hash_test_dir\n    assert artifact3._state.adding\n    assert artifact3.description is None\n    assert artifact3.path.exists()\n    artifact3.save()\n\n    # the state of artifact1 is lost, because artifact3 is stored at the same path\n    assert artifact3.overwrite_versions\n    assert artifact1.overwrite_versions\n    assert artifact3.path == artifact1.path\n    
test_filepath_added.unlink()\n\n    # delete the artifact\n    artifact2.delete(permanent=True, storage=False)\n    artifact3.delete(permanent=True, storage=False)\n\n\ndef test_create_from_path_overwrite_versions_false(get_test_filepaths):\n    # get variables from fixture\n    is_in_registered_storage = get_test_filepaths[0]\n    test_dirpath = get_test_filepaths[2]\n    hash_test_dir = get_test_filepaths[5]\n    if is_in_registered_storage:\n        return\n    artifact1 = ln.Artifact(\n        test_dirpath, key=\"my_folder\", overwrite_versions=False\n    ).save()\n    assert artifact1.hash == hash_test_dir\n    # skip artifact2 because we already test this above\n    # create a first file\n    test_filepath_added = test_dirpath / \"my_file_added.txt\"\n    test_filepath_added.write_text(\"2\")\n    artifact3 = ln.Artifact(test_dirpath, key=\"my_folder\", overwrite_versions=False)\n    assert artifact3.hash != hash_test_dir\n    artifact3.save()\n    # with overwrite_versions=False, artifact3 is stored at a different path than artifact1\n    assert not artifact3.overwrite_versions\n    assert not artifact1.overwrite_versions\n    assert artifact3.path != artifact1.path\n    test_filepath_added.unlink()\n    artifact1.delete(permanent=True, storage=False)\n    artifact3.delete(permanent=True, storage=False)\n\n\ndef test_delete_permanently_from_trash_folder(tmp_path):\n    folder_path = tmp_path / \"folder-overwrite-versions\"\n    folder_path.mkdir()\n    (folder_path / \"v1.txt\").write_text(\"v1\")\n    key = f\"{tmp_path.name}/folder-overwrite-versions\"\n\n    artifact = ln.Artifact(folder_path, key=key).save()\n    assert artifact.overwrite_versions\n\n    # First soft-delete (move to trash), then delete permanently.\n    artifact.delete()\n    artifact.refresh_from_db()\n    assert artifact.branch_id == -1\n\n    with patch(\"builtins.input\", return_value=\"y\"):\n        artifact.delete()\n\n    assert 
ln.Artifact.objects.filter(uid__startswith=artifact.stem_uid).count() == 0\n\n\ndef test_create_from_path_set_branch():\n    branch = ln.Branch(name=\"contrib1\").save()\n    artifact1 = ln.Artifact(\".gitignore\", key=\"test\", branch=branch).save()\n    # check hash lookup on different branch\n    artifact2 = ln.Artifact(\".gitignore\", key=\"test1\")\n    assert artifact1 == artifact2\n    # cleanup\n    artifact1.delete(permanent=True)\n    branch.delete(permanent=True)\n\n\n@pytest.mark.parametrize(\"key\", [None, \"my_new_folder\"])\ndef test_from_dir(get_test_filepaths, key):\n    is_in_registered_storage = get_test_filepaths[0]\n    test_dirpath = get_test_filepaths[2]\n    # the directory contains 3 files, two of them are duplicated\n    artifacts = ln.Artifact.from_dir(test_dirpath, key=key)\n    for artifact in artifacts:\n        if key is not None and is_in_registered_storage:\n            assert artifact._real_key is not None\n        else:\n            assert artifact._real_key is None\n    # we only return the duplicated ones\n    hashes = [artifact.hash for artifact in artifacts if artifact.hash is not None]\n    uids = [artifact.uid for artifact in artifacts]\n    assert len(set(hashes)) == len(hashes)\n    ln.UPath(test_dirpath).view_tree()\n    # now save\n    artifacts.save()\n    # now run again, because now we'll have hash-based lookup!\n    artifacts = ln.Artifact.from_dir(test_dirpath, key=key)\n    assert len(artifacts) == 2\n    assert len(set(artifacts)) == len(hashes)\n    queried_artifacts = ln.Artifact.filter(uid__in=uids)\n    for artifact in queried_artifacts:\n        artifact.delete(permanent=True, storage=False)\n\n\ndef test_create_from_dataframe(example_dataframe: pd.DataFrame):\n    df = example_dataframe\n    artifact = ln.Artifact.from_dataframe(df, description=\"test1\")\n    assert artifact.description == \"test1\"\n    assert artifact.key is None\n    assert artifact.otype == \"DataFrame\"\n    assert artifact.kind == 
\"dataset\"\n    assert artifact.n_observations == 2\n    assert hasattr(artifact, \"_local_filepath\")\n    artifact.key = \"my-test-dataset\"  # try changing key\n    with pytest.raises(ln.errors.InvalidArgument) as error:\n        artifact.save()\n    assert (\n        error.exconly()\n        == \"lamindb.errors.InvalidArgument: The suffix '' of the provided key is incorrect, it should be '.parquet'.\"\n    )\n    artifact.key = None  # restore\n    artifact.suffix = \".whatever\"  # changing suffix before first save is invalid\n    with pytest.raises(\n        ln.errors.InvalidArgument,\n        match=\"Cannot update the suffix of an artifact before it is saved.\",\n    ):\n        artifact.save()\n    artifact.suffix = \".parquet\"\n    artifact.save()\n    # check that the local filepath has been cleared\n    assert not hasattr(artifact, \"_local_filepath\")\n    del artifact\n\n    # now get an artifact from the database\n    artifact = ln.Artifact.get(description=\"test1\")\n    parquet_path = artifact.path\n    assert parquet_path.exists()\n    assert parquet_path.suffix == \".parquet\"\n    # test cancelling the move\n    artifact.suffix = \".whatever\"\n    with patch(\"builtins.input\", return_value=\"n\"):\n        assert artifact.save() is None\n    assert parquet_path.exists()\n\n    artifact = ln.Artifact.get(description=\"test1\")\n    assert artifact.suffix == \".parquet\"\n    artifact.suffix = \".whatever\"\n    with patch(\"builtins.input\", return_value=\"y\"):\n        artifact.save()\n    assert artifact.suffix == \".whatever\"\n    whatever_path = artifact.path\n    assert whatever_path.exists()\n    assert whatever_path.suffix == \".whatever\"\n    assert not parquet_path.exists()\n    artifact.suffix = \".parquet\"\n    with patch(\"builtins.input\", return_value=\"y\"):\n        artifact.save()\n    assert artifact.suffix == \".parquet\"\n    parquet_path_restored = artifact.path\n    assert parquet_path_restored.exists()\n    assert 
parquet_path_restored.suffix == \".parquet\"\n    assert not whatever_path.exists()\n\n    # coming from `key is None`, test that setting a key with a different suffix is not allowed\n    artifact.key = \"my-test-dataset.suffix\"\n    with pytest.raises(ln.errors.InvalidArgument) as error:\n        artifact.save()\n    assert (\n        error.exconly()\n        == \"lamindb.errors.InvalidArgument: The suffix '.suffix' of the provided key is incorrect, it should be '.parquet'.\"\n    )\n\n    # coming from `key is None`, test with no suffix\n    artifact.key = \"my-test-dataset\"\n    with pytest.raises(ln.errors.InvalidArgument) as error:\n        artifact.save()\n    assert (\n        error.exconly()\n        == \"lamindb.errors.InvalidArgument: The suffix '' of the provided key is incorrect, it should be '.parquet'.\"\n    )\n\n    # virtual key and suffix can now be updated together\n    artifact.key = \"my-test-dataset\"\n    artifact.suffix = \"\"\n    with patch(\"builtins.input\", return_value=\"y\"):\n        artifact.save()\n    assert artifact.suffix == \"\"\n    assert artifact.key == \"my-test-dataset\"\n\n    # changing the suffix updates the key suffix as well\n    artifact.suffix = \".parquet\"\n    with patch(\"builtins.input\", return_value=\"y\"):\n        artifact.save()\n    assert artifact.key == \"my-test-dataset.parquet\"\n\n    # coming from a .parquet key, test changing the key to no suffix\n    artifact.key = \"my-test-dataset\"\n    with pytest.raises(ln.errors.InvalidArgument) as error:\n        artifact.save()\n    assert (\n        error.exconly()\n        == \"lamindb.errors.InvalidArgument: The suffix '' of the provided key is incorrect, it should be '.parquet'.\"\n    )\n\n    artifact.delete(permanent=True)\n\n    # test from_dataframe with a path\n    path = Path(\"test_df_from_path.parquet\")\n    try:\n        example_dataframe.to_parquet(path)\n        for path_input in [path, str(path)]:\n            artifact = 
ln.Artifact.from_dataframe(\n                path_input, description=\"test from path\"\n            )\n            assert artifact.description == \"test from path\"\n            assert artifact.otype == \"DataFrame\"\n            assert artifact.kind == \"dataset\"\n            assert artifact.n_observations == 2\n            artifact.save()\n            artifact.delete(permanent=True)\n    finally:\n        path.unlink(missing_ok=True)\n\n\ndef test_dataframe_validate_suffix(example_dataframe: pd.DataFrame):\n    df = example_dataframe\n    artifact = ln.Artifact.from_dataframe(df, key=\"test_.parquet\")\n    assert artifact.suffix == \".parquet\"\n\n    with pytest.raises(ln.errors.InvalidArgument) as error:\n        artifact = ln.Artifact.from_dataframe(df, key=\"test_.def\")\n    assert (\n        error.exconly().partition(\",\")[0]\n        == \"lamindb.errors.InvalidArgument: The passed key's suffix '.def' must match the passed path's suffix '.parquet'.\"\n    )\n\n\ndef test_create_from_parquet_file_default_constructor(\n    example_dataframe: pd.DataFrame, ccaplog: pytest.LogCaptureFixture\n):\n    path = \"test_df.parquet\"\n    example_dataframe.to_parquet(path)\n    ln.Artifact(path, key=path)\n    assert \"data is a DataFrame, please use .from_dataframe()\" in ccaplog.text\n    Path(path).unlink()\n\n\ndef test_create_from_anndata(get_small_adata, adata_file, example_dataframe):\n    with pytest.raises(ValueError) as error:\n        ln.Artifact.from_anndata(example_dataframe, description=\"test1\")\n    assert (\n        \"data has to be an AnnData object or a path to AnnData-like\" in error.exconly()\n    )\n\n    for i, _a in enumerate([get_small_adata, adata_file]):\n        artifact = ln.Artifact.from_anndata(_a, description=\"test1\")\n        assert artifact.description == \"test1\"\n        assert artifact.key is None\n        assert artifact.otype == \"AnnData\"\n        assert artifact.kind == \"dataset\"\n        assert 
artifact.n_observations == 2\n        if i == 0:\n            assert hasattr(artifact, \"_local_filepath\")\n            artifact.save()\n            # check that the local filepath has been cleared\n            assert not hasattr(artifact, \"_local_filepath\")\n            artifact.delete(permanent=True)\n\n\ndef test_from_anndata_uses_h5ad_kwargs(get_small_adata):\n    artifact = ln.Artifact.from_anndata(\n        get_small_adata,\n        key=\"test_kwargs.h5ad\",\n        h5ad_kwargs={\"compression\": \"gzip\"},\n    )\n\n    local_path = artifact._local_filepath\n    with h5py.File(local_path, mode=\"r\") as store:\n        assert store[\"X\"].compression == \"gzip\"\n\n    local_path.unlink(missing_ok=True)\n\n\ndef test_from_anndata_uses_zarr_kwargs(get_small_adata):\n    chunks = (1, get_small_adata.n_vars)\n    artifact = ln.Artifact.from_anndata(\n        get_small_adata,\n        key=\"test_kwargs.zarr\",\n        format=\"zarr\",\n        zarr_kwargs={\"chunks\": chunks},\n    )\n\n    local_path = artifact._local_filepath\n    assert zarr.open(local_path, mode=\"r\")[\"X\"].chunks == chunks\n\n    shutil.rmtree(local_path)\n\n\ndef test_from_anndata_validate_suffix(get_small_adata):\n    artifact = ln.Artifact.from_anndata(get_small_adata, key=\"test_.h5ad\")\n    assert artifact.suffix == \".h5ad\"\n    artifact = ln.Artifact.from_anndata(\n        get_small_adata, format=\"h5ad\", key=\"test_.h5ad\"\n    )\n    assert artifact.suffix == \".h5ad\"\n    artifact = ln.Artifact.from_anndata(get_small_adata, key=\"test_.zarr\")\n    assert artifact.suffix == \".zarr\"\n\n    with pytest.raises(ValueError) as error:\n        artifact = ln.Artifact.from_anndata(get_small_adata, key=\"test_.def\")\n    assert (\n        error.exconly().partition(\",\")[0]\n        == \"ValueError: Error when specifying AnnData storage format\"\n    )\n\n    with pytest.raises(InvalidArgument) as error:\n        artifact = ln.Artifact.from_anndata(get_small_adata, 
key=\"test_\")\n    assert (\n        error.exconly().partition(\",\")[0]\n        == \"lamindb.errors.InvalidArgument: The passed key's suffix '' must match the passed path's suffix '.h5ad'.\"\n    )\n\n\ndef test_create_from_mudata(get_small_mdata, mudata_file, adata_file):\n    with pytest.raises(ValueError) as error:\n        ln.Artifact.from_mudata(adata_file, description=\"test1\")\n    assert \"data has to be a MuData object or a path to MuData-like\" in error.exconly()\n\n    for m in [get_small_mdata, mudata_file]:\n        af = ln.Artifact.from_mudata(m, description=\"test1\")\n        assert af.description == \"test1\"\n        assert af.key is None\n        assert af.otype == \"MuData\"\n        assert af.kind == \"dataset\"\n        if isinstance(m, md.MuData):\n            assert af.n_observations == 2\n\n\ndef test_create_from_spatialdata(\n    get_small_sdata, spatialdata_file, adata_file, ccaplog\n):\n    with pytest.raises(ValueError) as error:\n        ln.Artifact.from_spatialdata(adata_file, description=\"test1\")\n    assert (\n        \"data has to be a SpatialData object or a path to SpatialData-like\"\n        in error.exconly()\n    )\n\n    for s in [get_small_sdata, spatialdata_file]:\n        af = ln.Artifact(s, description=\"test1\")\n        assert af.description == \"test1\"\n        assert af.key is None\n        assert af.otype == \"SpatialData\"\n        assert af.kind is None\n        # n_observations not defined\n    assert \"data is a SpatialData, please use .from_spatialdata()\" in ccaplog.text\n    for s in [get_small_sdata, spatialdata_file]:\n        af = ln.Artifact.from_spatialdata(s, description=\"test1\")\n        assert af.description == \"test1\"\n        assert af.key is None\n        assert af.otype == \"SpatialData\"\n        assert af.kind == \"dataset\"\n        # n_observations not defined\n\n\n@pytest.mark.parametrize(\n    \"data\",\n    [\"get_small_adata\"],\n    indirect=True,\n)\ndef 
test_create_from_anndata_in_storage(data):\n    artifact = ln.Artifact.from_anndata(\n        data, description=\"test_create_from_anndata_memory\"\n    )\n    assert artifact.n_observations == data.n_obs\n    assert artifact.otype == \"AnnData\"\n    assert hasattr(artifact, \"_local_filepath\")\n    artifact.save()\n    # check that the local filepath has been cleared\n    assert not hasattr(artifact, \"_local_filepath\")\n\n\n# -------------------------------------------------------------------------------------\n# Life cycle management\n# -------------------------------------------------------------------------------------\n\n\ndef test_revise_recreate_artifact(example_dataframe: pd.DataFrame, ccaplog):\n    df = example_dataframe\n    # attempt to create a file with an invalid version\n    with pytest.raises(ValueError) as error:\n        artifact = ln.Artifact.from_dataframe(df, description=\"test\", version=0)\n    assert (\n        error.exconly()\n        == \"ValueError: `version` parameter must be `None` or `str`, e.g., '0.1', '1',\"\n        \" '2', etc.\"\n    )\n\n    # create a file and tag it with a version\n    key = \"my-test-dataset.parquet\"\n    artifact = ln.Artifact.from_dataframe(df, key=key, description=\"test\", version=\"1\")\n    assert artifact.version_tag == \"1\"\n    assert artifact.version == \"1\"\n    assert artifact.uid.endswith(\"0000\")\n    assert artifact.path.exists()  # because of cache file already exists\n    artifact.save()\n    assert artifact.path.exists()\n    assert artifact.suffix == \".parquet\"\n\n    with pytest.raises(ValueError) as error:\n        artifact_v2 = ln.Artifact.from_dataframe(df, revises=artifact, version=\"1\")\n    assert (\n        error.exconly()\n        == \"ValueError: Please change the version tag or leave it `None`, '1' is already taken\"\n    )\n\n    # create new file from old file\n    df.iloc[0, 0] = 99  # mutate dataframe so that hash lookup doesn't trigger\n    artifact_v2 = 
ln.Artifact.from_dataframe(df, revises=artifact)\n    assert artifact_v2.stem_uid == artifact.stem_uid\n    assert artifact_v2.uid.endswith(\"0001\")\n    # call this again\n    artifact_v2 = ln.Artifact.from_dataframe(df, revises=artifact)\n    assert artifact_v2.uid.endswith(\"0001\")\n    assert artifact_v2.stem_uid == artifact.stem_uid\n    assert artifact_v2.version_tag is None\n    assert (\n        artifact_v2.version == artifact_v2.uid[-4:]\n    )  # version falls back to uid suffix\n    assert artifact_v2.key == key\n    assert artifact.suffix == \".parquet\"\n    assert artifact_v2.description == \"test\"\n    assert artifact_v2._revises is not None\n    artifact_v2.save()\n    assert artifact_v2.path.exists()\n    assert artifact_v2._revises is None\n\n    # revise by providing `revises` argument (do not save)\n    df.iloc[0, 0] = 0  # mutate dataframe so that hash lookup doesn't trigger\n    artifact_v3 = ln.Artifact.from_dataframe(\n        df, description=\"test1\", revises=artifact_v2, version=\"2\"\n    )\n    assert artifact_v3.uid.endswith(\"0002\")\n    assert artifact_v3.stem_uid == artifact.stem_uid\n    assert artifact_v3.version_tag == \"2\"\n    assert artifact_v3.version == \"2\"\n    assert artifact_v3.description == \"test1\"\n    assert artifact_v3.key == key\n\n    # revise by matching on `key` (do not save)\n    artifact_v3 = ln.Artifact.from_dataframe(\n        df, key=key, description=\"test1\", version=\"2\"\n    )\n    assert artifact_v3.uid.endswith(\"0002\")\n    assert artifact_v3.stem_uid == artifact.stem_uid\n    assert artifact_v3.key == key\n    assert artifact_v3.version_tag == \"2\"\n    assert artifact_v3.version == \"2\"\n    assert artifact_v3.description == \"test1\"\n    assert artifact_v3.is_latest\n    assert artifact_v2.is_latest\n    artifact_v3.save()\n    # now artifact_v2 is no longer the latest version, but we need to refresh it from the db\n    artifact_v2.refresh_from_db()\n    assert not artifact_v2.is_latest\n\n    # 
re-create based on hash when artifact_v3 is in trash\n    artifact_v3.delete()\n    artifact_new = ln.Artifact.from_dataframe(\n        df,\n        key=\"my-test-dataset1.parquet\",\n    )\n    assert artifact_new != artifact_v3\n    assert artifact_new.hash == artifact_v3.hash\n    assert artifact_new.key == \"my-test-dataset1.parquet\"\n    artifact_v3.restore()  # restore from trash\n\n    # re-create based on hash while providing same key, previous version\n    df.iloc[0, 0] = 99  # this is a previous version\n    artifact_new = ln.Artifact.from_dataframe(\n        df,\n        key=key,\n    )\n    assert artifact_new == artifact_v2\n    assert artifact_new.hash == artifact_v2.hash\n    assert artifact_new.key == key\n    assert artifact.is_latest is False\n\n    # re-create based on hash while providing a different key\n    df.iloc[0, 0] = 0\n    artifact_new = ln.Artifact.from_dataframe(\n        df,\n        key=\"my-test-dataset1.parquet\",\n        description=\"test1 updated\",\n    )\n    assert artifact_new == artifact_v3\n    assert artifact_new.hash == artifact_v3.hash\n    assert artifact_new.key == key  # old key\n    assert artifact_new.description == \"test1 updated\"\n\n    # re-create while skipping hash lookup with different key\n    artifact_v4 = ln.Artifact.from_dataframe(\n        df,\n        key=\"my-test-dataset1.parquet\",\n        skip_hash_lookup=True,\n    )\n    assert artifact_v4.uid != artifact_v3.uid\n    assert artifact_v4.hash == artifact_v3.hash\n    assert artifact_v4.key == \"my-test-dataset1.parquet\"\n    artifact_v4.save()  # this just saves a duplicated file\n\n    # re-create while skipping hash lookup with same key\n    artifact_new = ln.Artifact.from_dataframe(\n        df,\n        key=\"my-test-dataset1.parquet\",\n        skip_hash_lookup=True,\n    )\n    assert artifact_new.uid != artifact_v4.uid\n    assert artifact_new.stem_uid == artifact_v4.stem_uid\n    assert artifact_new.hash == artifact_v4.hash\n    
artifact_new.save()  # should now violate unique constraint, falls back artifact_v4\n    assert artifact_new.uid == artifact_v4.uid\n\n    # re-create while skipping hash lookup artifact, move to trash before\n    artifact_v4.delete()\n    artifact_new = ln.Artifact.from_dataframe(\n        df,\n        key=\"my-test-dataset1.parquet\",\n        skip_hash_lookup=True,\n    )\n    assert artifact_new.uid != artifact_v4.uid\n    assert artifact_new.key == \"my-test-dataset1.parquet\"\n    assert \"returning artifact from trash\" not in ccaplog.text\n    artifact_new.save()  # should now violate unique constraint, retrieve artifact_v4 from trash\n    assert \"returning artifact from trash\" in ccaplog.text\n    assert artifact_new.uid == artifact_v4.uid\n    assert artifact_new.branch_id == 1  # restored to default branch\n\n    with pytest.raises(TypeError) as error:\n        ln.Artifact.from_dataframe(\n            df, description=\"test1a\", revises=ln.Record(name=\"test\")\n        )\n    assert error.exconly() == \"TypeError: `revises` has to be of type `Artifact`\"\n\n    artifact_v3.delete(permanent=True)\n    artifact_v2.delete(permanent=True)\n    artifact.delete(permanent=True)\n\n    # unversioned file\n    artifact = ln.Artifact.from_dataframe(df, description=\"test2\")\n    assert artifact.version_tag is None\n    assert artifact.version == artifact.uid[-4:]  # version falls back to uid suffix\n\n    # what happens if we don't save the old file?\n    # add a test for it!\n    artifact.save()\n\n    # create new file from old file\n    df.iloc[0, 0] = 101  # mutate dataframe so that hash lookup doesn't trigger\n    new_artifact = ln.Artifact.from_dataframe(df, revises=artifact)\n    assert artifact.version_tag is None\n    assert artifact.version == artifact.uid[-4:]  # version falls back to uid suffix\n    assert new_artifact.stem_uid == artifact.stem_uid\n    assert new_artifact.version_tag is None\n    assert (\n        new_artifact.version == 
new_artifact.uid[-4:]\n    )  # version falls back to uid suffix\n    assert new_artifact.description == artifact.description\n\n    new_artifact.save()\n    assert new_artifact.is_latest\n\n    assert \"you are saving to a non-latest version of the artifact\" not in ccaplog.text\n\n    old_artifact = ln.Artifact.get(artifact.id)  # to update is_latest from the db\n    assert not old_artifact.is_latest\n    old_artifact.description = \"change old version description\"\n    old_artifact.save()\n\n    assert \"you are saving to a non-latest version of the artifact\" in ccaplog.text\n\n    old_artifact.delete()\n    new_artifact.delete()\n\n    artifact_from_trash = ln.Artifact.get(new_artifact.uid[:-4])  # query with stem uid\n    assert artifact_from_trash.branch_id == -1\n\n    old_artifact.delete(permanent=True)\n    new_artifact.delete(permanent=True)\n    # check after cleanups\n    assert (\n        ccaplog.text.count(\"you are saving to a non-latest version of the artifact\")\n        == 1\n    )\n\n\ndef test_delete_and_restore_artifact(example_dataframe: pd.DataFrame):\n    df = example_dataframe\n    artifact = ln.Artifact.from_dataframe(\n        df, description=\"My test file to delete\"\n    ).save()\n    assert artifact.branch_id == 1\n    assert artifact.key is None or artifact._key_is_virtual\n    storage_path = artifact.path\n    # trash behavior\n    artifact.delete()\n    assert storage_path.exists()\n    assert artifact.branch_id == -1\n    assert ln.Artifact.filter(description=\"My test file to delete\").first() is None\n    assert ln.Artifact.filter(\n        description=\"My test file to delete\", branch__name=\"trash\"\n    ).first()\n    # no implicit restore from trash, we're making a new artifact\n    artifact_restored = ln.Artifact.from_dataframe(\n        df, description=\"My test file to delete\"\n    )\n    assert artifact_restored.branch_id == 1\n    assert artifact_restored != artifact\n    # permanent delete\n    
artifact.delete(permanent=True)\n    assert (\n        ln.Artifact.filter(description=\"My test file to delete\", branch_id=None).first()\n        is None\n    )\n    assert not storage_path.exists()  # deletes from storage is key_is_virtual\n\n\ndef test_delete_storage():\n    with pytest.raises(FileNotFoundError):\n        delete_storage(ln.settings.storage.root / \"test-delete-storage\")\n\n\ndef test_recreate_after_artifact_moved_in_storage(ccaplog):\n    # this needs to be in a registered storage location\n    Path(\"./default_storage_unit_core/test_file.txt\").write_text(\"test content\")\n    artifact = ln.Artifact(\"./default_storage_unit_core/test_file.txt\").save()\n    # now rename the file within the storage location\n    Path(\"./default_storage_unit_core/test_file.txt\").rename(\n        \"./default_storage_unit_core/moved_file.txt\"\n    )\n    ln.Artifact(\"./default_storage_unit_core/moved_file.txt\").save()\n    assert \"updating previous key\" in ccaplog.text\n    artifact.delete(permanent=True, storage=True)\n\n\n# -------------------------------------------------------------------------------------\n# Storage\n# -------------------------------------------------------------------------------------\n\n\ndef test_move_artifact_exception_handling():\n    import lamindb.models.artifact as artifact_module\n\n    class FakeFS:\n        def __init__(\n            self,\n            copy_error: Exception | None = None,\n            exists: bool = False,\n            rm_error: Exception | None = None,\n        ):\n            self.copy_error = copy_error\n            self._exists = exists\n            self.rm_error = rm_error\n            self.rm_calls = 0\n\n        def exists(self, path: str) -> bool:\n            return self._exists\n\n        def copy(self, source: str, target: str, recursive: bool = True):\n            if self.copy_error is not None:\n                raise self.copy_error\n\n        def rm(self, path: str, recursive: bool = True):\n 
           self.rm_calls += 1\n            if self.rm_error is not None:\n                raise self.rm_error\n\n    source_path = UPath(\"s3://lamindb-ci/source-artifact\")\n    storage = SimpleNamespace(path=UPath(\"s3://lamindb-ci\"), id=42)\n\n    # _rm_catch_error helper branches\n    fs_missing = FakeFS(exists=False)\n    assert (\n        artifact_module._rm_catch_error(fs_missing, \"s3://lamindb-ci/missing\") is None\n    )\n    assert fs_missing.rm_calls == 0\n\n    fs_ok = FakeFS(exists=True)\n    assert artifact_module._rm_catch_error(fs_ok, \"s3://lamindb-ci/target\") is None\n    assert fs_ok.rm_calls == 1\n\n    rm_error = RuntimeError(\"rm failed\")\n    fs_fail = FakeFS(exists=True, rm_error=rm_error)\n    returned_error = artifact_module._rm_catch_error(fs_fail, \"s3://lamindb-ci/target\")\n    assert returned_error is rm_error\n    assert fs_fail.rm_calls == 1\n\n    # copy branch: copy fails and cleanup helper is included in the message\n    artifact_copy = SimpleNamespace(path=source_path, storage_id=None)\n    with (\n        patch.object(\n            artifact_module,\n            \"_s\",\n            return_value=SimpleNamespace(\n                auto_storage_key_from_artifact=lambda _: \"target-artifact\"\n            ),\n        ),\n        patch.object(\n            artifact_module,\n            \"fs_for_moving\",\n            return_value=FakeFS(copy_error=ValueError(\"copy failed\")),\n        ),\n        patch.object(\n            artifact_module,\n            \"_rm_catch_error\",\n            return_value=RuntimeError(\"rm failed\"),\n        ) as rm_mock,\n    ):\n        with pytest.raises(RuntimeError, match=\"Failed to copy artifact\"):\n            artifact_module._move_artifact_to_storage(artifact_copy, storage)\n        assert rm_mock.call_count == 1\n\n    # target exists branch: raises before attempting copy\n    artifact_exists = SimpleNamespace(path=source_path, storage_id=None)\n    with (\n        patch.object(\n           
 artifact_module,\n            \"_s\",\n            return_value=SimpleNamespace(\n                auto_storage_key_from_artifact=lambda _: \"target-artifact\"\n            ),\n        ),\n        patch.object(\n            artifact_module, \"fs_for_moving\", return_value=FakeFS(exists=True)\n        ),\n    ):\n        with pytest.raises(FileExistsError, match=\"already exists\"):\n            artifact_module._move_artifact_to_storage(artifact_exists, storage)\n\n    # same source and target path is rejected early\n    artifact_same_path = SimpleNamespace(path=source_path, storage_id=None)\n    with patch.object(\n        artifact_module,\n        \"_s\",\n        return_value=SimpleNamespace(\n            auto_storage_key_from_artifact=lambda _: \"source-artifact\"\n        ),\n    ):\n        with pytest.raises(ValueError, match=\"Cannot move to the same path\"):\n            artifact_module._move_artifact_to_storage(artifact_same_path, storage)\n\n    # verification branch: sorted sizes mismatch triggers cleanup helper\n    artifact_mismatch = SimpleNamespace(path=source_path, storage_id=None)\n    with (\n        patch.object(\n            artifact_module,\n            \"_s\",\n            return_value=SimpleNamespace(\n                auto_storage_key_from_artifact=lambda _: \"target-artifact\"\n            ),\n        ),\n        patch.object(artifact_module, \"fs_for_moving\", return_value=FakeFS()),\n        patch.object(artifact_module, \"_sorted_sizes\", side_effect=[[1], [2]]),\n        patch.object(\n            artifact_module,\n            \"_rm_catch_error\",\n            return_value=RuntimeError(\"rm failed\"),\n        ) as rm_mock,\n    ):\n        with pytest.raises(RuntimeError, match=\"Move verification failed\"):\n            artifact_module._move_artifact_to_storage(artifact_mismatch, storage)\n        assert rm_mock.call_count == 1\n\n    # source-removal branch: move succeeds but rm(source) fails and is logged\n    artifact_rm_fail = 
SimpleNamespace(path=source_path, storage_id=None)\n    with (\n        patch.object(\n            artifact_module,\n            \"_s\",\n            return_value=SimpleNamespace(\n                auto_storage_key_from_artifact=lambda _: \"target-artifact\"\n            ),\n        ),\n        patch.object(\n            artifact_module,\n            \"fs_for_moving\",\n            return_value=FakeFS(rm_error=RuntimeError()),\n        ),\n        patch.object(artifact_module, \"_sorted_sizes\", side_effect=[[1], [1]]),\n        patch.object(artifact_module.logger, \"error\") as logger_error_mock,\n    ):\n        artifact_module._move_artifact_to_storage(artifact_rm_fail, storage)\n        assert artifact_rm_fail.storage_id == storage.id\n        assert logger_error_mock.call_count == 1\n\n\n@pytest.mark.parametrize(\"suffix\", [\".txt\", \"\", None])\ndef test_auto_storage_key_from_artifact_uid(suffix):\n    test_id = \"abo389f\"\n    if suffix is None:\n        with pytest.raises(AssertionError):\n            auto_storage_key_from_artifact_uid(test_id, suffix, False)\n    else:\n        assert AUTO_KEY_PREFIX == \".lamindb/\"\n        storage_key = auto_storage_key_from_artifact_uid(test_id, suffix, False)\n        assert storage_key == f\"{AUTO_KEY_PREFIX}{test_id}{suffix}\"\n\n\ndef test_storage_root_upath_equivalence():\n    storage_root = UPath(\"s3://lamindb-ci\")\n    filepath = UPath(\"s3://lamindb-ci/test-data/Species.csv\")\n    assert filepath.parents[-1] == storage_root\n\n\ndef test_get_relative_path_to_directory():\n    # upath on S3\n    upath_root = UPath(\"s3://lamindb-ci\")\n    upath_directory1 = UPath(\"s3://lamindb-ci/test-data\")  # no trailing slash\n    upath_directory2 = UPath(\"s3://lamindb-ci/test-data/\")  # trailing slash\n    upath_file = UPath(\"s3://lamindb-ci/test-data/test.csv\")\n    assert (\n        \"test-data/test.csv\"\n        == get_relative_path_to_directory(upath_file, upath_root).as_posix()\n    )\n    assert (\n        
\"test.csv\"\n        == get_relative_path_to_directory(upath_file, upath_directory1).as_posix()\n    )\n    assert (\n        \"test.csv\"\n        == get_relative_path_to_directory(upath_file, upath_directory2).as_posix()\n    )\n    # local path\n    root = Path(\"/lamindb-ci\")\n    upath = Path(\"/lamindb-ci/test-data/test.csv\")\n    assert (\n        \"test-data/test.csv\"\n        == get_relative_path_to_directory(upath, directory=root).as_posix()\n    )\n    local_upath_root = UPath(root.as_posix())\n    local_upath_file = UPath(upath.as_posix())\n    assert (\n        \"test-data/test.csv\"\n        == get_relative_path_to_directory(\n            local_upath_file, directory=local_upath_root\n        ).as_posix()\n    )\n    with pytest.raises(TypeError) as error:\n        get_relative_path_to_directory(upath, directory=\".\")\n    assert error.exconly() == \"TypeError: Directory not of type Path or UPath\"\n\n\ndef test_check_path_is_child_of_root():\n    # str\n    root = \"s3://lamindb-ci\"\n    upath = \"s3://lamindb-ci/test-data/test.csv\"\n    assert check_path_is_child_of_root(upath, root=root)\n    # str different protocols\n    root = \"prot1://lamindb-ci\"\n    upath = \"prot2://lamindb-ci/test-data/test.csv\"\n    assert not check_path_is_child_of_root(upath, root=root)\n    # UPath\n    root = UPath(\"s3://lamindb-ci\")\n    upath = UPath(\"s3://lamindb-ci/test-data/test.csv\")\n    assert check_path_is_child_of_root(upath, root=root)\n    upath2 = UPath(\"s3://lamindb-setup/test-data/test.csv\")\n    assert not check_path_is_child_of_root(upath2, root=root)\n    # local path\n    root = Path(\"/lamindb-ci\")\n    path = Path(\"/lamindb-ci/test-data/test.csv\")\n    assert check_path_is_child_of_root(path, root=root)\n    path = Path(\"/lamindb-other/test-data/test.csv\")\n    assert not check_path_is_child_of_root(path, root=root)\n    # Local & UPath\n    root = UPath(\"s3://lamindb-ci\")\n    path = Path(\"/lamindb-ci/test-data/test.csv\")\n 
   assert not check_path_is_child_of_root(path, root=root)\n    # different storage_options\n    upath = UPath(\"s3://lamindb-ci/test-data/test.csv\", cache_regions=True)\n    assert upath.storage_options != root.storage_options\n    assert check_path_is_child_of_root(upath, root=root)\n    # the second level\n    root = UPath(\"s3://lamindb-ci/test-data/\")\n    upath = UPath(\"s3://lamindb-ci/test-data/test/test.csv\")\n    assert check_path_is_child_of_root(upath, root=root)\n    upath2 = UPath(\"s3://lamindb-ci/test-data-1/test/test.csv\")\n    assert not check_path_is_child_of_root(upath2, root=root)\n    # http\n    assert check_path_is_child_of_root(\n        \"https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/README.md\",\n        root=\"https://raw.githubusercontent.com\",\n    )\n    # s3 with endpoint\n    assert not check_path_is_child_of_root(\n        \"s3://bucket/key?endpoint_url=http://localhost:8000\",\n        root=\"s3://bucket/\",\n    )\n    assert not check_path_is_child_of_root(\n        \"s3://bucket/key/\",\n        root=\"s3://bucket/?endpoint_url=http://localhost:8000\",\n    )\n    assert check_path_is_child_of_root(\n        \"s3://bucket/key?endpoint_url=http://localhost:8000\",\n        root=\"s3://bucket?endpoint_url=http://localhost:8000\",\n    )\n    assert check_path_is_child_of_root(\n        UPath(\"s3://bucket/key\", endpoint_url=\"http://localhost:8000\"),\n        root=\"s3://bucket?endpoint_url=http://localhost:8000\",\n    )\n\n\ndef test_serialize_paths():\n    fp_str = ln.examples.datasets.anndata_file_pbmc68k_test().as_posix()\n    fp_path = Path(fp_str)\n\n    up_str = \"s3://lamindb-ci/test-unknown-storage-in-core-tests/test.csv\"\n    up_upath = UPath(up_str)\n\n    storage = ln.settings.storage.record\n    using_key = None\n\n    _, filepath, _, _, _ = process_data(\n        \"id\", fp_str, None, None, storage, using_key, skip_existence_check=True\n    )\n    assert isinstance(filepath, 
LocalPathClasses)\n    _, filepath, _, _, _ = process_data(\n        \"id\", fp_path, None, None, storage, using_key, skip_existence_check=True\n    )\n    assert isinstance(filepath, LocalPathClasses)\n\n    with pytest.raises(ln.errors.UnknownStorageLocation) as err:\n        _, filepath, _, _, _ = process_data(\n            \"id\",\n            up_str,\n            None,\n            None,\n            storage,\n            using_key,\n            skip_existence_check=True,\n        )\n    assert f\"Path {up_str} is not contained in any known storage\" in err.exconly()\n    storage = ln.Storage(\n        root=\"s3://lamindb-ci/test-unknown-storage-in-core-tests\"\n    ).save()\n    _, filepath, _, _, _ = process_data(\n        \"id\", up_str, None, None, storage, using_key, skip_existence_check=True\n    )\n    assert isinstance(filepath, CloudPath)\n    _, filepath, _, _, _ = process_data(\n        \"id\",\n        up_upath,\n        None,\n        None,\n        storage,\n        using_key,\n        skip_existence_check=True,\n    )\n    assert isinstance(filepath, CloudPath)\n    storage.delete()\n    Path(\"pbmc68k_test.h5ad\").unlink(missing_ok=True)\n\n\n# -------------------------------------------------------------------------------------\n# Data structures in storage\n# -------------------------------------------------------------------------------------\n\n\ndef test_data_is_anndata_paths():\n    assert data_is_scversedatastructure(\"something.h5ad\", \"AnnData\")\n    assert data_is_scversedatastructure(\"something.anndata.zarr\", \"AnnData\")\n    assert data_is_scversedatastructure(\n        \"s3://somewhere/something.anndata.zarr\", \"AnnData\"\n    )\n    assert not data_is_scversedatastructure(\"s3://somewhere/something.zarr\", \"AnnData\")\n\n\ndef test_data_is_anndata_anndatacessor(get_small_adata):\n    artifact = ln.Artifact(get_small_adata, key=\"test_adata.h5ad\").save()\n\n    with artifact.open(mode=\"r\") as access:\n        assert 
data_is_scversedatastructure(access, \"AnnData\")\n\n    artifact.delete(permanent=True)\n\n\ndef test_data_is_mudata_paths():\n    assert data_is_scversedatastructure(\"something.h5mu\", \"MuData\")\n    assert data_is_scversedatastructure(\"something.mudata.zarr\", \"MuData\")\n\n\ndef test_data_is_spatialdata_paths():\n    assert data_is_scversedatastructure(\"something.spatialdata.zarr\", \"SpatialData\")\n\n\n@pytest.mark.parametrize(\n    \"data,data_type,expected\",\n    [\n        (\"get_small_adata\", \"AnnData\", True),\n        (\"get_small_mdata\", \"MuData\", True),\n        (\"get_small_sdata\", \"SpatialData\", True),\n        (\"get_small_adata\", \"MuData\", False),\n        (\"get_small_mdata\", \"AnnData\", False),\n        (\"get_small_sdata\", \"AnnData\", False),\n        (\"get_small_adata\", None, True),\n        (pd.DataFrame(), \"AnnData\", False),\n        (None, \"AnnData\", False),\n        (None, None, False),\n    ],\n)\ndef test_data_is_scversedatastructure(request, data, data_type, expected):\n    if isinstance(data, str) and data.startswith(\"get_small_\"):\n        data = request.getfixturevalue(data)\n\n    assert data_is_scversedatastructure(data, data_type) == expected\n\n\n# -------------------------------------------------------------------------------------\n# Miscellaneous\n# -------------------------------------------------------------------------------------\n\n\ndef test_load_to_memory(tsv_file, zip_file, fcs_file, yaml_file):\n    # tsv\n    df = load_tsv(tsv_file)\n    assert isinstance(df, pd.DataFrame)\n    # fcs\n    adata = load_fcs(str(fcs_file))\n    assert isinstance(adata, ad.AnnData)\n    # error\n    with pytest.raises(NotImplementedError):\n        load_to_memory(zip_file)\n    # check that it is a path\n    assert isinstance(load_to_memory(\"./somefile.rds\"), UPath)\n    # yaml\n    dct = load_to_memory(yaml_file)\n    assert dct[\"a\"] == 1\n    assert dct[\"b\"] == 2\n\n    with pytest.raises(TypeError) 
as error:\n        ln.Artifact(True)\n    assert error.exconly() == \"TypeError: data has to be a string, Path, UPath\"\n\n\ndef test_bulk_delete():\n    report_path = Path(\"report.html\")\n    report_path.write_text(\"a\")\n    environment_path = Path(\"environment.txt\")\n    environment_path.write_text(\"c\")\n    report = ln.Artifact(report_path, description=\"Report\").save()\n    report_path.unlink()\n    report_path = report.path\n    environment = ln.Artifact(environment_path, description=\"requirement.txt\").save()\n    environment_path.unlink()\n    environment_path = environment.path\n\n    ln.Artifact.filter(id__in=[environment.id, report.id]).delete()\n\n    assert len(ln.Artifact.filter(id__in=[environment.id, report.id], branch_id=1)) == 0\n\n    # the 2 artifacts are in trash now\n    assert (\n        len(\n            ln.Artifact.filter(\n                id__in=[environment.id, report.id],\n                branch_id=-1,\n            )\n        )\n        == 2\n    )\n\n    ln.Artifact.filter(id__in=[environment.id, report.id], branch_id=-1).delete(\n        permanent=True\n    )\n    # now they're gone\n    assert (\n        len(\n            ln.Artifact.filter(\n                id__in=[environment.id, report.id],\n                branch_id=None,\n            )\n        )\n        == 0\n    )\n\n    assert not report_path.exists()\n    assert not environment_path.exists()\n\n\n@pytest.mark.parametrize(\"module_name\", [\"mudata\", \"spatialdata\"])\ndef test_no_unnecessary_imports(\n    example_dataframe: pd.DataFrame, module_name: str\n) -> None:\n    if module_name in sys.modules:\n        del sys.modules[module_name]\n\n    af = ln.Artifact.from_dataframe(example_dataframe, description=\"to delete\").save()\n\n    loaded_packages = []\n    for name, module in sys.modules.items():\n        if isinstance(module, ModuleType) and not name.startswith(\"_\"):\n            if \".\" not in name:\n                loaded_packages.append(name)\n\n    
assert module_name not in sorted(loaded_packages)\n\n    # Cleanup and restore imports to ensure that other tests still run smoothly\n    af.delete(permanent=True)\n    import mudata  # noqa\n    import spatialdata  # noqa\n\n\ndef test_artifact_get_tracking(example_dataframe: pd.DataFrame):\n    artifact = ln.Artifact.from_dataframe(example_dataframe, key=\"df.parquet\").save()\n\n    transform = ln.Transform(key=\"test track artifact via get\").save()\n    run = ln.Run(transform).save()\n\n    assert (\n        ln.Artifact.get(key=\"df.parquet\", is_run_input=run) in run.input_artifacts.all()\n    )\n\n    artifact.delete(permanent=True)\n    transform.delete(permanent=True)\n\n\ndef test_get_by_path(example_dataframe: pd.DataFrame):\n    artifact = ln.Artifact.from_dataframe(example_dataframe, key=\"df.parquet\").save()\n    artifact_path = artifact.path\n\n    assert ln.Artifact.get(path=artifact_path) == artifact\n    assert ln.Artifact.filter().get(path=artifact_path.as_posix()) == artifact\n\n    with pytest.raises(ln.errors.ObjectDoesNotExist):\n        ln.Artifact.get(path=\"s3://bucket/folder/file.parquet\")\n\n    with pytest.raises(ValueError):\n        ln.User.get(path=\"some/path\")\n\n    artifact.delete(permanent=True)\n\n    path_str = \"s3://lamindb-ci/test-data/test.csv\"\n    storage = ln.Storage(ln.UPath(path_str).parent).save()\n\n    artifact = ln.Artifact(path_str, description=\"test get by path\").save()\n    assert not artifact._key_is_virtual\n    assert artifact._real_key is None\n    assert ln.Artifact.get(path=path_str) == artifact\n\n    artifact.delete(permanent=True, storage=False)\n\n    artifact = ln.Artifact(path_str, key=\"some_file.csv\").save()\n    assert artifact._key_is_virtual\n    assert artifact._real_key.endswith(\"test.csv\")\n    assert ln.Artifact.get(path=path_str) == artifact\n\n    artifact.delete(permanent=True, storage=False)\n\n    storage.delete()\n\n\ndef 
test_update_suffix_for_registered_storage_with_real_key(\n    registered_storage_file_and_folder,\n):\n    test_filepath, folder_path = registered_storage_file_and_folder\n    assert folder_path.exists() and folder_path.is_dir()\n\n    artifact = ln.Artifact(test_filepath, key=\"my_file.csv\").save()\n    assert artifact._real_key is not None\n    assert artifact.path.suffix == \".csv\"\n\n    source_path = artifact.path\n    artifact.suffix = \".tsv\"\n    with patch(\"builtins.input\", return_value=\"y\"):\n        artifact.save()\n\n    target_path = artifact.path\n    assert artifact.suffix == \".tsv\"\n    assert artifact.key is not None\n    assert artifact.key.endswith(\".tsv\")\n    assert artifact._real_key is not None\n    assert artifact._real_key.endswith(\".tsv\")\n    assert target_path.suffix == \".tsv\"\n    assert target_path.exists()\n    assert not source_path.exists()\n\n    artifact.delete(permanent=True, storage=False)\n\n\ndef test_update_suffix_for_registered_storage_folder_artifact(\n    registered_storage_file_and_folder,\n):\n    _, folder_path = registered_storage_file_and_folder\n    artifact = ln.Artifact(folder_path, key=\"dataset\").save()\n\n    assert artifact._real_key is not None\n    assert artifact.suffix == \"\"\n    assert artifact.path.exists()\n    assert artifact.path.is_dir()\n\n    source_path = artifact.path\n    artifact.suffix = \".zarr\"\n    with patch(\"builtins.input\", return_value=\"y\"):\n        artifact.save()\n\n    target_path = artifact.path\n    assert artifact.suffix == \".zarr\"\n    assert artifact.key is not None\n    assert artifact.key.endswith(\".zarr\")\n    assert artifact._real_key is not None\n    assert artifact._real_key.endswith(\".zarr\")\n    assert target_path.exists()\n    assert target_path.is_dir()\n    assert target_path.suffix == \".zarr\"\n    assert not source_path.exists()\n\n    artifact.delete(permanent=True, storage=False)\n\n\ndef 
test_update_non_virtual_key_for_registered_storage_file(\n    registered_storage_file_and_folder,\n):\n    test_filepath, _ = registered_storage_file_and_folder\n    artifact = ln.Artifact(test_filepath).save()\n    assert not artifact._key_is_virtual\n    assert artifact._real_key is None\n    assert artifact.key is not None\n\n    source_path = artifact.path\n    source_key = artifact.key\n    target_key = (\n        PurePosixPath(source_key)\n        .with_name(\"suffix_fixture_file_renamed.csv\")\n        .as_posix()\n    )\n    artifact.key = target_key\n    with patch(\"builtins.input\", return_value=\"n\"):\n        assert artifact.save() is None\n    assert source_path.exists()\n\n    artifact = ln.Artifact.get(uid=artifact.uid)\n    assert artifact.key == source_key\n    artifact.key = target_key\n    with patch(\"builtins.input\", return_value=\"y\"):\n        artifact.save()\n\n    target_path = artifact.path\n    assert artifact.key == target_key\n    assert target_path.exists()\n    assert not source_path.exists()\n\n    artifact.delete(permanent=True, storage=False)\n\n\ndef test_update_non_virtual_key_for_registered_storage_file_invalid_suffix(\n    registered_storage_file_and_folder,\n):\n    test_filepath, _ = registered_storage_file_and_folder\n    artifact = ln.Artifact(test_filepath).save()\n    assert artifact.key is not None\n\n    artifact.key = PurePosixPath(artifact.key).with_suffix(\".tsv\").as_posix()\n    with pytest.raises(InvalidArgument) as error:\n        artifact.save()\n    assert (\n        error.exconly()\n        == \"lamindb.errors.InvalidArgument: The suffix '.tsv' of the provided key is incorrect, it should be '.csv'.\"\n    )\n\n    artifact.delete(permanent=True, storage=False)\n\n\ndef test_update_key_to_none_raises_invalid_argument(\n    registered_storage_file_and_folder,\n):\n    test_filepath, _ = registered_storage_file_and_folder\n    artifact = ln.Artifact(test_filepath).save()\n    artifact.key = None\n\n    with 
pytest.raises(InvalidArgument) as error:\n        artifact.save()\n    assert (\n        error.exconly()\n        == \"lamindb.errors.InvalidArgument: Cannot update an artifact key to None.\"\n    )\n\n    artifact.delete(permanent=True, storage=False)\n\n\ndef test_update_non_virtual_key_before_save_raises_invalid_argument(tsv_file):\n    artifact = ln.Artifact(tsv_file, key=\"before-save.tsv\", _key_is_virtual=False)\n    artifact.key = \"after-edit.tsv\"\n\n    with pytest.raises(InvalidArgument) as error:\n        artifact.save()\n    assert (\n        error.exconly()\n        == \"lamindb.errors.InvalidArgument: Cannot update the key of an artifact before it is saved.\"\n    )\n\n\ndef test_update_non_virtual_key_in_unmanaged_storage_raises_invalid_argument():\n    url = (\n        \"https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/README.md\"\n    )\n    artifact = ln.Artifact(url, description=\"test unmanaged key update\").save()\n    assert not artifact._key_is_virtual\n    artifact.key = \"laminlabs/lamindb/refs/heads/main/README-renamed.md\"\n    with pytest.raises(InvalidArgument) as error:\n        artifact.save()\n    assert (\n        error.exconly()\n        == \"lamindb.errors.InvalidArgument: Cannot update a non-virtual key of an artifact in a storage location that is not managed by the current instance.\"\n    )\n\n    artifact.delete(permanent=True, storage=False)\n\n\ndef test_create_artifact_in_foreign_managed_storage_raises_value_error(tsv_file):\n    storage = ln.settings.storage.record\n    with (\n        patch.object(storage, \"instance_uid\", \"_not_exists_\"),\n        pytest.raises(\n            ValueError,\n            match=(\n                \"Cannot create an artifact in a storage location that is not managed by the current instance.\"\n            ),\n        ),\n    ):\n        ln.Artifact(tsv_file, storage=storage)\n\n\ndef test_save_url_with_virtual_key_and_unmanaged_suffix_update_error():\n    url = (\n       
 \"https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/README.md\"\n    )\n    key = \"folder/file.md\"\n    artifact = ln.Artifact(url, key=key).save()\n\n    assert artifact._real_key == \"laminlabs/lamindb/refs/heads/main/README.md\"\n    assert artifact.storage.instance_uid is None\n\n    cache_path_str = artifact._cache_path.as_posix()\n    assert not cache_path_str.startswith(\"http\")\n    assert cache_path_str.endswith(key)\n\n    artifact.suffix = \".txt\"\n    with pytest.raises(\n        InvalidArgument,\n        match=(\n            \"Cannot update the suffix of an artifact in a storage location \"\n            \"that is not managed by the current instance.\"\n        ),\n    ):\n        artifact.save()\n\n    artifact.delete(permanent=True, storage=False)\n\n\ndef test_change_space_for_artifact_in_foreign_managed_storage_raises_value_error(\n    tsv_file,\n):\n    artifact = ln.Artifact(tsv_file, key=\"space-change-foreign-storage.tsv\").save()\n    space = ln.Space(\n        name=\"test space change in foreign storage\", uid=\"foreignspace\"\n    ).save()\n    artifact.space = space\n    with (\n        patch.object(artifact.storage, \"instance_uid\", \"_not_exists_\"),\n        pytest.raises(\n            ValueError,\n            match=(\n                \"Cannot change the space of an artifact in a storage location that is not managed by the current instance.\"\n            ),\n        ),\n    ):\n        artifact.save()\n\n    artifact.delete(permanent=True)\n    space.delete(permanent=True)\n\n\ndef test_save_artifact_to_foreign_managed_storage_raises_value_error(tsv_file):\n    artifact = ln.Artifact(tsv_file, key=\"save-foreign-storage.tsv\")\n    with (\n        patch.object(artifact.storage, \"instance_uid\", \"_not_exists_\"),\n        pytest.raises(\n            ValueError,\n            match=(\n                \"Cannot save an artifact to a storage location that is not managed by the current instance.\"\n            ),\n     
   ),\n    ):\n        artifact.save()\n\n\ndef test_artifact_space_change(tsv_file):\n    artifact = ln.Artifact(tsv_file, key=\"test_space_change.tsv\").save()\n    space = ln.Space(name=\"test space change\", uid=\"00000234\").save()\n    # test after saving\n    artifact.space = space\n    with pytest.raises(ValueError) as err:\n        artifact.save()\n    assert (\n        \"No local storage locations managed by the current instance found for the space\"\n        in err.exconly()\n    )\n    # test after getting from the db\n    artifact = ln.Artifact.get(key=\"test_space_change.tsv\")\n    artifact.space = space\n    with pytest.raises(ValueError) as err:\n        artifact.save()\n    assert (\n        \"No local storage locations managed by the current instance found for the space\"\n        in err.exconly()\n    )\n\n    artifact.delete(permanent=True)\n    space.delete(permanent=True)\n\n\ndef test_passing_foreign_keys_ids(tsv_file):\n    transform = ln.Transform(key=\"test passings foreign keys ids\").save()\n    first_run = ln.Run(transform).save()\n    second_run = ln.Run(transform).save()\n\n    # check that passing a wrong type errors\n    with pytest.raises(AssertionError):\n        ln.Artifact(tsv_file, space=transform)\n\n    with pytest.raises(ValueError) as err:\n        ln.Artifact(tsv_file, run=first_run, run_id=first_run.id)\n    assert \"Do not pass both Run and its id at the same time.\" in err.exconly()\n\n    artifact = ln.Artifact(tsv_file, run=first_run, key=\"test_fk.tsv\").save()\n    artifact_id = artifact.id\n    assert artifact.run == first_run\n\n    artifact = ln.Artifact(tsv_file, run_id=second_run.id)  # same hash\n    assert artifact.id == artifact_id\n    assert artifact._subsequent_run_id == second_run.id\n    assert second_run in artifact.recreating_runs.all()\n\n    # Run-side: output_artifacts vs recreated_artifacts\n    assert list(first_run.output_artifacts.all()) == [artifact]\n    assert 
list(first_run.recreated_artifacts.all()) == []\n    assert list(second_run.output_artifacts.all()) == []\n    assert list(second_run.recreated_artifacts.all()) == [artifact]\n\n    # query_output_artifacts\n    assert list(first_run.query_output_artifacts(include_recreated=False)) == [artifact]\n    assert list(first_run.query_output_artifacts(include_recreated=True)) == [artifact]\n    assert list(second_run.query_output_artifacts(include_recreated=False)) == []\n    assert list(second_run.query_output_artifacts(include_recreated=True)) == [artifact]\n\n    artifact.delete(permanent=True)\n    second_run.delete(permanent=True)\n    first_run.delete(permanent=True)\n    transform.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_artifact_dataframe_with_curation.py",
    "content": "# Note: Almost all logic for schema-based validation is handled in the curators test suite\n# This here only covers external feature annotation and validation\n\nimport lamindb as ln\nimport pandas as pd\nimport pytest\n\n\n@pytest.fixture(scope=\"module\")\ndef two_internal_features():\n    feat1 = ln.Feature(name=\"feat1\", dtype=int).save()\n    feat2 = ln.Feature(name=\"feat2\", dtype=int).save()\n    yield feat1, feat2\n    feat1.delete(permanent=True)\n    feat2.delete(permanent=True)\n\n\n@pytest.fixture(scope=\"module\")\ndef two_external_features():\n    feature_a = ln.Feature(name=\"feature_a\", dtype=str).save()\n    feature_b = ln.Feature(name=\"feature_b\", dtype=str).save()\n    yield feature_a, feature_b\n    feature_a.delete(permanent=True)\n    feature_b.delete(permanent=True)\n\n\n@pytest.mark.parametrize(\"use_schema\", [True, False])\ndef test_create_artifact_with_external_feature_annotations(\n    use_schema: bool,\n    two_external_features: tuple[ln.Feature, ln.Feature],\n):\n    feat1, feat2 = two_external_features\n    if use_schema:\n        schema = ln.Schema(features=[feat1, feat2]).save()\n    else:\n        schema = None\n    artifact = ln.Artifact(\n        \".gitignore\",\n        key=\"test_file\",\n        features={\"feature_a\": \"x\", \"feature_b\": \"y\"},\n        schema=schema,\n    ).save()\n    assert artifact.features.get_values() == {\"feature_a\": \"x\", \"feature_b\": \"y\"}\n    assert artifact.schema == schema\n    # repeat to check idempotency (requires set_values() instead of add_values())\n    artifact = ln.Artifact(\n        \".gitignore\",\n        key=\"test_file\",\n        features={\"feature_a\": \"x\", \"feature_b\": \"y\"},\n        schema=schema,\n    ).save()\n    assert artifact.features.get_values() == {\"feature_a\": \"x\", \"feature_b\": \"y\"}\n    assert artifact.schema == schema\n    if use_schema:\n        with pytest.raises(ValueError) as error:\n            
artifact.features.remove_values(\"feature_a\", value=\"x\")\n        assert (\n            \"Cannot remove values if artifact has external schema.\" in error.exconly()\n        )\n    else:\n        artifact.features.remove_values(\"feature_a\", value=\"x\")\n        assert artifact.features.get_values() == {\"feature_b\": \"y\"}\n    artifact.delete(permanent=True)\n    if use_schema:\n        schema.delete(permanent=True)\n\n\ndef test_artifact_from_dataframe_with_schema(example_dataframe: pd.DataFrame):\n    df = example_dataframe\n    feat1 = ln.Feature(name=\"feat1\", dtype=int).save()\n    artifact = ln.Artifact.from_dataframe(\n        df, key=\"test_df.parquet\", schema=\"valid_features\"\n    ).save()\n    # repeat to check idempotency\n    artifact = ln.Artifact.from_dataframe(\n        df, key=\"test_df.parquet\", schema=\"valid_features\"\n    ).save()\n    assert artifact.schema == ln.examples.schemas.valid_features()\n    assert artifact.features.get_values() == {}\n    assert (\n        artifact.features.describe(return_str=True)\n        == \"\"\"\\\nArtifact: test_df.parquet (0000)\n└── Dataset features\n    └── columns (1)\n        feat1               int\"\"\"\n    )\n    inferred_schema_link = artifact.schemas.through.get(artifact_id=artifact.id)\n    assert inferred_schema_link.slot == \"columns\"\n    assert inferred_schema_link.schema.members.count() == 1\n    assert inferred_schema_link.schema.members.first() == feat1\n    inferred_schema = inferred_schema_link.schema\n    inferred_schema_link.delete()\n    inferred_schema.delete(permanent=True)\n    feat1.delete(permanent=True)\n    artifact.delete(permanent=True)\n\n\ndef test_artifact_dataframe_with_features(example_dataframe: pd.DataFrame):\n    \"\"\"Test column names encoding when features with the same names are present.\"\"\"\n    artifact = ln.Artifact.from_dataframe(example_dataframe, key=\"df.parquet\").save()\n    id_feature = ln.Feature(name=\"id\", dtype=int).save()\n    
uid_feature = ln.Feature(name=\"uid\", dtype=str).save()\n    artifact.features.add_values({\"id\": 1, \"uid\": \"test-uid\"})\n    df = ln.Artifact.filter(key=\"df.parquet\").to_dataframe(\n        include=[\"description\"], features=True\n    )\n    assert df.index.name == \"__lamindb_artifact_id__\"\n    assert df.columns.tolist() == [\n        \"__lamindb_artifact_uid__\",\n        \"key\",\n        \"id\",\n        \"uid\",\n        \"description\",\n    ]\n    assert df.iloc[0][\"id\"] == 1\n    assert df.iloc[0][\"uid\"] == \"test-uid\"\n\n    artifact.delete(permanent=True)\n    id_feature.delete(permanent=True)\n    uid_feature.delete(permanent=True)\n\n\ndef test_from_dataframe_with_external_schema(\n    example_dataframe: pd.DataFrame,\n    two_external_features: tuple[ln.Feature, ln.Feature],\n    two_internal_features: tuple[ln.Feature, ln.Feature],\n):\n    df = example_dataframe\n    feat1, feat2 = two_internal_features\n    featA, featB = two_external_features\n    schema_external = ln.Schema(features=[featA, featB]).save()\n\n    # Case 1: wrong internal features for this dataframe\n    schema_with_mistake = ln.Schema(\n        features=[featA, featB],\n        slots={\"__external__\": schema_external},\n        otype=\"DataFrame\",\n    ).save()\n    with pytest.raises(ln.errors.ValidationError) as error:\n        artifact = ln.Artifact.from_dataframe(\n            df,\n            key=\"test_df_with_external_features.parquet\",\n            features={\"feature_a\": \"x\", \"feature_b\": \"y\"},\n            schema=schema_with_mistake,\n        ).save()\n    assert \"COLUMN_NOT_IN_DATAFRAME\" in error.exconly()\n\n    # alternative via DataFrameCurator directly\n    with pytest.raises(ln.errors.ValidationError) as error:\n        ln.curators.DataFrameCurator(\n            df,\n            schema=schema_with_mistake,\n        ).validate()\n    assert \"COLUMN_NOT_IN_DATAFRAME\" in error.exconly()\n\n    # Case 2: no schema for external features 
provided\n    schema_no_external = ln.Schema(features=[feat1, feat2]).save()\n    artifact = ln.Artifact.from_dataframe(\n        df,\n        key=\"test_df_with_external_features.parquet\",\n        features={\"feature_a\": \"x\", \"feature_b\": \"y\"},\n        schema=schema_no_external,\n    ).save()\n    assert artifact.features.get_values() == {\"feature_a\": \"x\", \"feature_b\": \"y\"}\n    artifact.delete(permanent=True)\n\n    # alternative via DataFrameCurator directly\n    curator = ln.curators.DataFrameCurator(\n        df,\n        schema=schema_no_external,\n        features={\"feature_a\": \"x\", \"feature_b\": \"y\"},\n    )\n    artifact = curator.save_artifact(\n        key=\"test_df_with_external_features.parquet\",\n    ).save()\n    assert artifact.features.get_values() == {\"feature_a\": \"x\", \"feature_b\": \"y\"}\n    artifact.delete(permanent=True)\n\n    # Case 3: correct external schema\n    schema_correct_external = ln.Schema(\n        features=[feat1, feat2],\n        slots={\"__external__\": schema_external},\n        otype=\"DataFrame\",\n    ).save()\n\n    # Case 3a: user passes no external features\n    with pytest.raises(ln.errors.ValidationError) as error:\n        artifact = ln.Artifact.from_dataframe(\n            df,\n            key=\"test_df_with_external_features.parquet\",\n            schema=schema_correct_external,\n        ).save()\n    assert (\n        \"External features slot is defined in schema but no external features were provided.\"\n        in error.exconly()\n    )\n\n    # alternative via DataFrameCurator directly\n    with pytest.raises(ln.errors.ValidationError) as error:\n        curator = ln.curators.DataFrameCurator(\n            df,\n            schema=schema_correct_external,\n        )\n        artifact = curator.save_artifact(\n            key=\"test_df_with_external_features.parquet\",\n        ).save()\n    assert (\n        \"External features slot is defined in schema but no external features 
were provided.\"\n        in error.exconly()\n    )\n\n    # Case 3b: user provides external features\n    artifact = ln.Artifact.from_dataframe(\n        df,\n        key=\"test_df_with_external_features.parquet\",\n        features={\"feature_a\": \"x\", \"feature_b\": \"y\"},\n        schema=schema_correct_external,\n    ).save()\n    assert artifact.features.get_values() == {\"feature_a\": \"x\", \"feature_b\": \"y\"}\n    assert (\n        artifact.features.describe(return_str=True)\n        == \"\"\"\\\nArtifact: test_df_with_external_features.parquet (0000)\n├── Dataset features\n│   └── columns (2)\n│       feat1               int\n│       feat2               int\n└── External features\n    └── feature_a           str                      x\n        feature_b           str                      y\"\"\"\n    )\n    with pytest.raises(ValueError) as error:\n        artifact.features.remove_values(\"feature_a\", value=\"x\")\n    assert \"Cannot remove values if artifact has external schema.\" in error.exconly()\n    artifact.delete(permanent=True)\n\n    # alternative via DataFrameCurator directly\n    curator = ln.curators.DataFrameCurator(\n        df,\n        schema=schema_correct_external,\n        features={\"feature_a\": \"x\", \"feature_b\": \"y\"},\n    )\n    artifact = curator.save_artifact(\n        key=\"test_df_with_external_features.parquet\",\n    ).save()\n    assert artifact.features.get_values() == {\"feature_a\": \"x\", \"feature_b\": \"y\"}\n\n    # call this again to check calling with an existing artifact\n    curator = ln.curators.DataFrameCurator(\n        artifact,\n        schema=schema_correct_external,\n        features={\"feature_a\": \"z\", \"feature_b\": \"y\"},\n    )\n    artifact = curator.save_artifact(\n        key=\"test_df_with_external_features.parquet\",\n    ).save()\n    assert artifact.features.get_values() == {\"feature_a\": \"z\", \"feature_b\": \"y\"}\n\n    # call this again without passing features explicitly 
(they're already part of the artifact)\n    curator = ln.curators.DataFrameCurator(\n        artifact,\n        schema=schema_correct_external,\n    )\n    artifact = curator.save_artifact(\n        key=\"test_df_with_external_features.parquet\",\n    ).save()\n    assert artifact.features.get_values() == {\"feature_a\": \"z\", \"feature_b\": \"y\"}\n\n    # clean up everything\n    inferred_schema = artifact.schemas.all()[0]\n    artifact.schemas.remove(inferred_schema.id)\n    inferred_schema.delete(permanent=True)\n    artifact.delete(permanent=True)\n    schema_with_mistake.delete(permanent=True)\n    schema_no_external.delete(permanent=True)\n    schema_correct_external.delete(permanent=True)\n    schema_external.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_artifact_describe_to_dataframe.py",
    "content": "from datetime import date\n\nimport bionty as bt\nimport lamindb as ln\nimport numpy as np\nimport pandas as pd\nimport pytest\nfrom lamindb.models._describe import describe_postgres, describe_sqlite\n\n\ndef _check_df_equality(actual_df: pd.DataFrame, expected_df: pd.DataFrame) -> bool:\n    \"\"\"Checks equality between two DataFrames.\n\n    Special handling for columns containing sets and NaN values.\n    \"\"\"\n    # do not test indices by default\n    # pd.testing.assert_index_equal(actual_df.index, expected_df.index)\n    expected_df.index = actual_df.index\n    assert set(actual_df.columns) == set(expected_df.columns)\n    for col in expected_df.columns:\n        # Detect if column contains sets by checking first non-null value\n        first_value = next((v for v in expected_df[col] if pd.notna(v)), None)\n        is_set_column = isinstance(first_value, set)\n        if is_set_column:\n            # For set columns, compare sets with NaN handling\n            for idx in expected_df.index:\n                actual_val = actual_df.loc[idx, col]\n                expected_val = expected_df.loc[idx, col]\n                # If both are NaN, they're equal\n                if pd.isna(actual_val) and pd.isna(expected_val):\n                    continue\n                # If one is NaN and the other isn't, they're not equal\n                if pd.isna(actual_val) != pd.isna(expected_val):\n                    raise AssertionError(f\"NaN mismatch at index {idx} in column {col}\")\n                # If neither is NaN, compare the sets\n                assert actual_val == expected_val, (\n                    f\"Set mismatch at index {idx} in column {col}\"\n                )\n        else:\n            pd.testing.assert_series_equal(\n                actual_df[col],\n                expected_df[col],\n                check_names=False,  # ignore series names\n            )\n    return True\n\n\n# parallels the `registries` guide\n# please also see the 
test_querset.py tests\ndef test_describe_to_dataframe_example_dataset():\n    ln.examples.datasets.mini_immuno.save_mini_immuno_datasets()\n    artifact = ln.Artifact.get(key=\"examples/dataset1.h5ad\")\n    artifact2 = ln.Artifact.get(key=\"examples/dataset2.h5ad\")\n\n    with pytest.raises(ValueError) as error:\n        artifact.features.remove_values(\"cell_type_by_expert\")\n    assert \"Cannot remove values for dataset features.\" in error.exconly()\n\n    # Test df(include=[...])\n    df = (\n        ln.Artifact.filter(key__startswith=\"examples/dataset\", suffix=\".h5ad\")\n        .order_by(\"-key\")\n        .to_dataframe(include=[\"schemas__hash\", \"schemas__name\"])\n        .drop([\"uid\"], axis=1)\n    )\n    expected_data = {\n        \"key\": [\"examples/dataset2.h5ad\", \"examples/dataset1.h5ad\"],\n        \"schemas__hash\": [\n            set(artifact2.schemas.all().values_list(\"hash\", flat=True)),\n            set(artifact.schemas.all().values_list(\"hash\", flat=True)),\n        ],\n        \"schemas__name\": [{None}, {None}],\n    }\n    expected_df = pd.DataFrame(expected_data)\n    _check_df_equality(df, expected_df)\n\n    # Test df with features\n    # test that the records filter DOES NOT affect joining the annotations\n    # we want it to only affect the artifact query (even though here, it won't change the result as both artifacts have the IFNG label)\n    df = (\n        ln.Artifact.filter(\n            key__startswith=\"examples/dataset\",\n            suffix=\".h5ad\",\n            records__name=\"IFNG\",\n        )\n        .order_by(\"-key\")\n        .to_dataframe(\n            features=[\n                \"cell_type_by_expert\",\n                \"cell_type_by_model\",\n                \"experiment\",\n                \"perturbation\",\n                \"temperature\",\n                \"study_note\",\n                \"date_of_study\",\n            ]\n        )\n        .drop([\"uid\"], axis=1)\n    )\n    expected_data = {\n 
       \"key\": [\"examples/dataset2.h5ad\", \"examples/dataset1.h5ad\"],\n        \"cell_type_by_expert\": [np.nan, {\"CD8-positive, alpha-beta T cell\", \"B cell\"}],\n        \"cell_type_by_model\": [{\"T cell\", \"B cell\"}, {\"T cell\", \"B cell\"}],\n        \"experiment\": pd.Categorical([\"Experiment 2\", \"Experiment 1\"]),\n        \"perturbation\": [{\"IFNG\", \"DMSO\"}, {\"IFNG\", \"DMSO\"}],\n        \"temperature\": [22.6, 21.6],\n        \"study_note\": [\n            np.nan,\n            \"We had a great time performing this study and the results look compelling.\",\n        ],\n        \"date_of_study\": [date(2025, 2, 13), date(2024, 12, 1)],\n        \"study_metadata\": [\n            {\"detail1\": \"456\", \"detail2\": 2},\n            {\"detail1\": \"123\", \"detail2\": 1},\n        ],\n    }\n    expected_df = pd.DataFrame(expected_data)\n    _check_df_equality(df, expected_df)\n\n    # Test filtering artifacts by schemas__in (alternative approach)\n    # Query artifacts that measure CD8A gene by filtering schemas first\n    cd8a = bt.Gene.get(symbol=\"CD8A\")\n    schemas_with_cd8a = ln.Schema.filter(genes=cd8a)\n    df = ln.Artifact.filter(schemas__in=schemas_with_cd8a).to_dataframe()\n    assert set(df[\"key\"]) == {\"examples/dataset2.h5ad\", \"examples/dataset1.h5ad\"}\n    # check backward compat query with deprecation warning\n    with pytest.warns(\n        DeprecationWarning, match=\"Querying Artifact by `feature_sets` is deprecated\"\n    ):\n        df = ln.Artifact.filter(feature_sets__in=schemas_with_cd8a).to_dataframe()\n    assert set(df[\"key\"]) == {\"examples/dataset2.h5ad\", \"examples/dataset1.h5ad\"}\n\n    # expected output has italicized elements that can't be tested\n    # hence testing is restricted to section content, not headings\n    output = artifact.describe(return_str=True)\n    assert \"hash:\" in output\n    assert \"size:\" in output\n    assert \"schema:\" in output\n    assert \"n_observations: 3\" in 
output\n    assert \"storage/path:\" in output\n    assert \"created_by:\" in output\n    assert \"created_at:\" in output\n\n    # dataset section\n    assert (\n        artifact.features.describe(return_str=True)\n        == \"\"\"Artifact: examples/dataset1.h5ad (0000)\n├── Dataset features\n│   ├── obs (4)\n│   │   cell_type_by_expe…  bionty.CellType          B cell, CD8-positive, alph…\n│   │   cell_type_by_model  bionty.CellType          B cell, T cell\n│   │   perturbation        Record                   DMSO, IFNG\n│   │   sample_note         str\n│   └── var.T (3 bionty.G…\n│       CD14                num\n│       CD4                 num\n│       CD8A                num\n└── External features\n    └── experiment          Record                   Experiment 1\n        date_of_study       date                     2024-12-01\n        study_metadata      dict                     {'detail1': '123', 'detail…\n        study_note          str                      We had a great time perfor…\n        temperature         float                    21.6\"\"\"\n    )\n\n    # labels section\n    if ln.setup.settings.instance.dialect == \"postgresql\":\n        description_tree = describe_postgres(artifact)\n    else:\n        description_tree = describe_sqlite(artifact)\n    labels_node = description_tree.children[-1].label\n    assert labels_node.label.plain == \"Labels\"\n    assert len(labels_node.children[0].label.columns) == 3\n    assert len(labels_node.children[0].label.rows) == 2\n    assert labels_node.children[0].label.columns[0]._cells == [\n        \".records\",\n        \".cell_types\",\n    ]\n    assert labels_node.children[0].label.columns[1]._cells[0].plain == \"Record\"\n    assert labels_node.children[0].label.columns[1]._cells[1].plain == \"bionty.CellType\"\n    assert {\n        c.strip()\n        for c in \",\".join(labels_node.children[0].label.columns[2]._cells).split(\",\")\n    } == {\n        \"DMSO\",\n        \"IFNG\",\n        \"Experiment 
1\",\n        \"B cell\",\n        \"T cell\",\n        \"CD8-positive\",\n        \"alpha-beta T cell\",\n    }\n\n    # set_values should only replace external features, not dataset-derived features\n    values_before = artifact.features.get_values()\n    adata = artifact.load()\n    just_internal = {\n        col: values_before[col] for col in adata.obs.columns if col in values_before\n    }\n    artifact.features.set_values({\"temperature\": 99.0})\n    values_after_set = artifact.features.get_values()\n    assert {col: values_after_set[col] for col in just_internal} == just_internal\n    assert values_after_set[\"temperature\"] == 99.0\n    assert set(values_after_set.keys()) == set(just_internal) | {\"temperature\"}\n\n    # test that only external features are removed upon artifact.features.remove_values()\n    alljson_values = artifact.features.get_values()\n    artifact.features.remove_values()\n    assert just_internal != alljson_values\n    assert just_internal == artifact.features.get_values()\n\n    artifact.delete(permanent=True)\n    artifact2.delete(permanent=True)\n    ln.Schema.get(name=\"anndata_ensembl_gene_ids_and_valid_features_in_obs\").delete(\n        permanent=True\n    )\n    ln.Schema.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n    bt.Gene.filter().delete(permanent=True)\n    ln.Record.filter().delete(permanent=True)\n    bt.CellType.filter().delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_artifact_features_annotations.py",
    "content": "# ruff: noqa: F811\n\nfrom datetime import date, datetime\n\nimport bionty as bt\nimport lamindb as ln\nimport pytest\nfrom lamindb.examples.datasets import mini_immuno\nfrom lamindb.models.query_set import BasicQuerySet, SQLRecordList\n\n\n# see test_record_basics.py for similar test for records (populate and query by features)\ndef test_artifact_features_add_remove_query():\n    record_type1 = ln.Record(name=\"RecordType1\", is_type=True).save()\n    record_entity1 = ln.Record(name=\"entity1\", type=record_type1).save()\n    record_entity2 = ln.Record(name=\"entity2\", type=record_type1).save()\n    ulabel = ln.ULabel(name=\"test-ulabel\").save()\n    artifact = ln.Artifact(\".gitignore\", key=\"test-artifact\").save()\n    transform = ln.Transform(key=\"test-transform\").save()\n    run = ln.Run(transform, name=\"test-run\").save()\n\n    feature_str = ln.Feature(name=\"feature_str\", dtype=str).save()\n    feature_list_str = ln.Feature(name=\"feature_list_str\", dtype=list[str]).save()\n    feature_int = ln.Feature(name=\"feature_int\", dtype=int).save()\n    feature_float = ln.Feature(name=\"feature_float\", dtype=float).save()\n    feature_num = ln.Feature(name=\"feature_num\", dtype=\"num\").save()\n    feature_datetime = ln.Feature(name=\"feature_datetime\", dtype=datetime).save()\n    feature_date = ln.Feature(\n        name=\"feature_date\", dtype=datetime.date, coerce=True\n    ).save()\n    feature_dict = ln.Feature(name=\"feature_dict\", dtype=dict).save()\n    feature_type1 = ln.Feature(name=\"feature_type1\", dtype=record_type1).save()\n    feature_type1s = ln.Feature(name=\"feature_type1s\", dtype=list[record_type1]).save()\n    feature_ulabel = ln.Feature(name=\"feature_ulabel\", dtype=ln.ULabel).save()\n    feature_user = ln.Feature(name=\"feature_user\", dtype=ln.User).save()\n    feature_project = ln.Feature(name=\"feature_project\", dtype=ln.Project).save()\n    feature_artifact = ln.Feature(name=\"feature_artifact\", 
dtype=ln.Artifact).save()\n    feature_artifact_2 = ln.Feature(name=\"feature_artifact_2\", dtype=ln.Artifact).save()\n    feature_run = ln.Feature(name=\"feature_run\", dtype=ln.Run.uid).save()\n    feature_cell_line = ln.Feature(name=\"feature_cell_line\", dtype=bt.CellLine).save()\n    ln.Feature(name=\"feature_cell_line_pass_list\", dtype=bt.CellLine).save()\n    feature_cell_lines = ln.Feature(\n        name=\"feature_cell_lines\", dtype=list[bt.CellLine]\n    ).save()\n    feature_cl_ontology_id = ln.Feature(\n        name=\"feature_cl_ontology_id\", dtype=bt.CellLine.ontology_id\n    ).save()\n    feature_gene_ontology_id = ln.Feature(\n        name=\"feature_gene_ontology_id\", dtype=bt.Gene.ensembl_gene_id\n    ).save()\n\n    test_artifact = ln.Artifact(\".gitignore\", key=\"test_artifact\").save()\n    value_artifact = ln.Artifact(\"pyproject.toml\", key=\"value_artifact.toml\").save()\n    test_project = ln.Project(name=\"test_project\").save()\n    hek293 = bt.CellLine.from_source(name=\"HEK293\").save()\n    a549 = bt.CellLine.from_source(name=\"A-549\").save()\n    gene1 = bt.Gene.from_source(ensembl_gene_id=\"ENSG00000139618\").save()\n    gene2 = bt.Gene.from_source(ensembl_gene_id=\"ENSG00000141510\").save()\n\n    # no schema validation\n\n    test_values = {\n        \"feature_str\": \"a string value\",\n        \"feature_list_str\": [\"value1\", \"value2\", \"value3\"],\n        \"feature_int\": 42,\n        \"feature_float\": 3.14,\n        \"feature_num\": 2.71,\n        \"feature_datetime\": datetime(2024, 1, 1, 12, 0, 0),\n        \"feature_date\": date(2024, 1, 1),\n        \"feature_dict\": {\"key\": \"value\", \"number\": 123, \"list\": [1, 2, 3]},\n        \"feature_type1\": \"entity1\",\n        \"feature_type1s\": [\"entity1\", \"entity2\"],\n        \"feature_ulabel\": \"test-ulabel\",\n        \"feature_user\": ln.setup.settings.user.handle,\n        \"feature_project\": \"test_project\",\n        \"feature_cell_line\": 
\"HEK293\",\n        # allowed if observational unit not specified, comes from aggregation\n        \"feature_cell_line_pass_list\": [\"HEK293\", \"A-549\"],\n        \"feature_cell_lines\": [\"HEK293\", \"A-549\"],\n        \"feature_cl_ontology_id\": \"CVCL_0045\",\n        \"feature_artifact\": \"test-artifact\",\n        \"feature_artifact_2\": \"value_artifact.toml\",\n        \"feature_run\": run.uid,\n    }\n\n    test_artifact.features.add_values(test_values)\n\n    # ManyToMany accessors\n    assert set(test_artifact.artifacts.to_list()) == {test_artifact, value_artifact}\n    assert set(value_artifact.linked_by_artifacts.to_list()) == {test_artifact}\n    assert set(test_artifact.linked_by_artifacts.to_list()) == {test_artifact}\n    assert value_artifact.artifacts.to_list() == []\n\n    # get_values accessor\n    return_values = test_artifact.features.get_values()\n\n    # special handling if passing a list of categories to a cat feature: it's interpreted as the result of an aggregation\n    # hence upon retrieval it's a set of categories, not a list of categories\n    values_pass_list = return_values.pop(\"feature_cell_line_pass_list\")\n    assert values_pass_list == set(test_values.pop(\"feature_cell_line_pass_list\"))\n    assert return_values == test_values\n\n    # __get_item__ accessor\n    assert test_artifact.features[\"feature_str\"] == test_values[\"feature_str\"]\n    assert test_artifact.features[\"feature_list_str\"] == test_values[\"feature_list_str\"]\n    assert test_artifact.features[\"feature_int\"] == test_values[\"feature_int\"]\n    assert test_artifact.features[\"feature_float\"] == test_values[\"feature_float\"]\n    assert test_artifact.features[\"feature_num\"] == test_values[\"feature_num\"]\n    assert test_artifact.features[\"feature_datetime\"] == test_values[\"feature_datetime\"]\n    assert test_artifact.features[\"feature_date\"] == test_values[\"feature_date\"]\n    assert test_artifact.features[\"feature_dict\"] == 
test_values[\"feature_dict\"]\n    assert test_artifact.features[\"feature_type1\"] == record_entity1\n    assert set(test_artifact.features[\"feature_type1s\"]) == {\n        record_entity1,\n        record_entity2,\n    }\n    assert test_artifact.features[\"feature_ulabel\"] == ulabel\n    assert (\n        test_artifact.features[\"feature_user\"].handle == ln.setup.settings.user.handle\n    )\n    assert test_artifact.features[\"feature_project\"] == test_project\n    assert test_artifact.features[\"feature_cell_line\"] == hek293\n    assert test_artifact.features[\"feature_cl_ontology_id\"] == hek293\n    value = test_artifact.features[\"feature_cell_line_pass_list\"]\n    assert set(value) == {hek293, a549}\n    assert isinstance(value, BasicQuerySet)\n    value = test_artifact.features[\"feature_cell_lines\"]\n    assert set(value) == {hek293, a549}\n    assert isinstance(value, SQLRecordList)\n    assert test_artifact.features[\"feature_artifact\"] == test_artifact\n    assert test_artifact.features[\"feature_artifact_2\"] == value_artifact\n    assert test_artifact.features[\"feature_run\"] == run\n\n    # --- Query by features (same data as above) ---\n    # Equality\n    assert ln.Artifact.filter(feature_str=\"a string value\").one() == test_artifact\n    assert ln.Artifact.filter(feature_int=42).one() == test_artifact\n    assert ln.Artifact.filter(feature_type1=\"entity1\").one() == test_artifact\n    assert ln.Artifact.filter(feature_cell_line=\"HEK293\").one() == test_artifact\n    assert (\n        ln.Artifact.filter(feature_str=\"a string value\", feature_int=42).one()\n        == test_artifact\n    )\n    # Datetime and date (filter uses ISO strings as stored in JSON)\n    assert (\n        ln.Artifact.filter(feature_datetime=\"2024-01-01T12:00:00\").one()\n        == test_artifact\n    )\n    assert ln.Artifact.filter(feature_date=\"2024-01-01\").one() == test_artifact\n    # __contains (categorical)\n    assert 
ln.Artifact.filter(feature_cell_line__contains=\"HEK\").one() == test_artifact\n    assert ln.Artifact.filter(feature_type1__contains=\"entity\").one() == test_artifact\n    # Invalid field\n    with pytest.raises(ln.errors.InvalidArgument) as error:\n        ln.Artifact.filter(feature_str_typo=\"x\", feature_int=42).one()\n    assert error.exconly().startswith(\n        \"lamindb.errors.InvalidArgument: You can query either by available fields:\"\n    )\n    # ln.errors.ObjectDoesNotExist (no object named \"nonexistent_entity\" exists)\n    with pytest.raises(ln.errors.ObjectDoesNotExist) as error:\n        ln.Artifact.filter(feature_type1=\"nonexistent_entity\").one()\n    assert \"Did not find\" in error.exconly()\n\n    # Combined filter (3 keys)\n    assert (\n        ln.Artifact.filter(\n            feature_str=\"a string value\",\n            feature_int=42,\n            feature_type1=\"entity1\",\n        ).one()\n        == test_artifact\n    )\n    # Bionty: filter by record\n    assert ln.Artifact.filter(feature_cell_line=hek293).one() == test_artifact\n    # Bionty: filter by ontology_id string\n    assert ln.Artifact.filter(feature_cl_ontology_id=\"CVCL_0045\").one() == test_artifact\n    # Bionty __contains (ontology_id)\n    assert (\n        ln.Artifact.filter(feature_cl_ontology_id__contains=\"0045\").one()\n        == test_artifact\n    )\n    # ln.errors.ObjectDoesNotExist (object not found: feature_project)\n    with pytest.raises(ln.errors.ObjectDoesNotExist) as error:\n        ln.Artifact.filter(feature_project=\"nonexistent_project\").one()\n    assert \"Did not find\" in error.exconly()\n    # __contains returns multiple (add second artifact, assert, then remove)\n    value_artifact.features.add_values({\"feature_type1\": \"entity2\"})\n    assert len(ln.Artifact.filter(feature_type1__contains=\"entity\")) == 2\n    value_artifact.features.remove_values(\"feature_type1\")\n    # Numeric comparators __lt, __gt (int, float, num)\n    assert 
ln.Artifact.filter(feature_int__lt=21).one_or_none() is None\n    assert len(ln.Artifact.filter(feature_int__gt=21)) >= 1\n    # int __lt/__gt that would fail with string comparison (42 vs 5, 42 vs 100)\n    assert ln.Artifact.filter(feature_int__lt=5).one_or_none() is None\n    assert ln.Artifact.filter(feature_int__gt=100).one_or_none() is None\n    # float/num __lt/__gt (numeric comparison on SQLite via json_extract + CAST)\n    assert ln.Artifact.filter(feature_float__lt=5.0).one() == test_artifact\n    assert ln.Artifact.filter(feature_float__gt=1.0).one() == test_artifact\n    assert ln.Artifact.filter(feature_float__gt=10.0).one_or_none() is None\n    assert ln.Artifact.filter(feature_num__lt=5.0).one() == test_artifact\n    assert ln.Artifact.filter(feature_num__gt=1.0).one() == test_artifact\n    assert ln.Artifact.filter(feature_num__gt=10.0).one_or_none() is None\n    # Date and datetime comparators (ISO strings)\n    assert ln.Artifact.filter(feature_date__lt=\"2024-01-02\").one() == test_artifact\n    assert ln.Artifact.filter(feature_date__gt=\"2023-12-31\").one() == test_artifact\n    assert ln.Artifact.filter(feature_date__gt=\"2024-01-02\").one_or_none() is None\n    assert (\n        ln.Artifact.filter(feature_datetime__lt=\"2024-01-01T13:00:00\").one()\n        == test_artifact\n    )\n    assert (\n        ln.Artifact.filter(feature_datetime__gt=\"2024-01-01T11:00:00\").one()\n        == test_artifact\n    )\n    assert (\n        ln.Artifact.filter(feature_datetime__lt=\"2024-01-01T11:00:00\").one_or_none()\n        is None\n    )\n\n    # remove values\n\n    # this was already popped from test_values above\n    test_artifact.features.remove_values(\"feature_cell_line_pass_list\")\n\n    test_artifact.features.remove_values(\"feature_int\")\n    test_values.pop(\"feature_int\")\n    test_artifact.features.remove_values(\"feature_float\")\n    test_values.pop(\"feature_float\")\n    test_artifact.features.remove_values(\"feature_num\")\n    
test_values.pop(\"feature_num\")\n    assert test_artifact.features.get_values() == test_values\n\n    test_artifact.features.remove_values(\"feature_date\")\n    test_values.pop(\"feature_date\")\n    assert test_artifact.features.get_values() == test_values\n\n    test_artifact.features.remove_values(\"feature_type1\")\n    test_values.pop(\"feature_type1\")\n    assert test_artifact.features.get_values() == test_values\n\n    test_artifact.features.remove_values(\"feature_type1s\")\n    test_values.pop(\"feature_type1s\")\n    assert test_artifact.features.get_values() == test_values\n\n    test_artifact.features.remove_values(\"feature_ulabel\")\n    test_values.pop(\"feature_ulabel\")\n    assert test_artifact.features.get_values() == test_values\n\n    # test passing a list to remove_values\n\n    test_artifact.features.remove_values([\"feature_cell_line\", \"feature_user\"])\n    test_values.pop(\"feature_cell_line\")\n    test_values.pop(\"feature_user\")\n    assert test_artifact.features.get_values() == test_values\n\n    test_artifact.features.remove_values(\"feature_artifact\")\n    test_values.pop(\"feature_artifact\")\n    assert test_artifact.features.get_values() == test_values\n\n    test_artifact.features.remove_values(\"feature_run\")\n    test_values.pop(\"feature_run\")\n    assert test_artifact.features.get_values() == test_values\n\n    # test passing None has no effect, does not lead to annotation\n\n    test_artifact.features.add_values(\n        {\n            \"feature_int\": None,\n            \"feature_float\": None,\n            \"feature_num\": None,\n            \"feature_type1\": None,\n        }\n    )\n    assert test_artifact.features.get_values() == test_values\n\n    # test bulk removal\n\n    assert list(test_values.keys()) == [\n        \"feature_str\",\n        \"feature_list_str\",\n        \"feature_datetime\",\n        \"feature_dict\",\n        \"feature_project\",\n        \"feature_cell_lines\",\n        
\"feature_cl_ontology_id\",\n        \"feature_artifact_2\",\n    ]\n    test_artifact.features.remove_values()\n    test_values = {}\n    assert test_artifact.features.get_values() == test_values\n\n    # test passing ISO-format date string for date\n\n    test_artifact.features.add_values({\"feature_date\": \"2024-01-01\"})\n    test_values[\"feature_date\"] = date(2024, 1, 1)\n    assert test_artifact.features.get_values() == test_values\n\n    # test passing bionty objects instead of strings (using gene1 and gene2 because organism-dependent ontologies)\n    test_artifact.features.add_values({\"feature_gene_ontology_id\": [gene1, gene2]})\n    test_values[\"feature_gene_ontology_id\"] = {\"ENSG00000139618\", \"ENSG00000141510\"}\n    assert test_artifact.features.get_values() == test_values\n    test_values.pop(\"feature_gene_ontology_id\")\n    test_artifact.features.remove_values(\"feature_gene_ontology_id\")\n\n    # test add_values() when there is already something there\n\n    test_artifact.features.add_values({\"feature_date\": \"2024-02-01\"})\n    test_values[\"feature_date\"] = {date(2024, 1, 1), date(2024, 2, 1)}\n    test_artifact.features.add_values({\"feature_str\": \"a string value\"})\n    test_values[\"feature_str\"] = \"a string value\"\n    assert test_artifact.features.get_values() == test_values\n\n    # test set_values()\n\n    test_values = {}\n    test_values[\"feature_date\"] = date(2024, 3, 1)\n    test_artifact.features.set_values({\"feature_date\": \"2024-03-01\"})\n    assert test_artifact.features.get_values() == test_values\n\n    # schema validation\n\n    feature_str = ln.Feature.get(name=\"feature_str\")\n    feature_int = ln.Feature.get(name=\"feature_int\")\n    schema = ln.Schema([feature_str, feature_int], name=\"test_schema\").save()\n    with pytest.raises(ln.errors.ValidationError) as error:\n        test_artifact.features.add_values({\"feature_type1\": \"entity1\"}, schema=schema)\n    assert \"COLUMN_NOT_IN_DATAFRAME\" 
in error.exconly()\n    schema.delete(permanent=True)\n\n    # test with list of strings\n\n    schema = ln.Schema([feature_cell_lines], name=\"test_schema2\").save()\n    test_artifact.features.add_values(\n        {\"feature_cell_lines\": [\"HEK293\", \"A-549\"]}, schema=schema\n    )\n    schema.delete(permanent=True)\n\n    # test with list of records (rather than passing strings)\n\n    schema = ln.Schema([feature_cell_lines], name=\"test_schema2\").save()\n    test_artifact.features.add_values(\n        {\"feature_cell_lines\": [a549, hek293]}, schema=schema\n    )\n    schema.delete(permanent=True)\n\n    # clean up rest\n\n    test_artifact.delete(permanent=True)\n    feature_str.delete(permanent=True)\n    feature_list_str.delete(permanent=True)\n    feature_int.delete(permanent=True)\n    feature_float.delete(permanent=True)\n    feature_num.delete(permanent=True)\n    feature_datetime.delete(permanent=True)\n    feature_date.delete(permanent=True)\n    feature_type1.delete(permanent=True)\n    feature_type1s.delete(permanent=True)\n    feature_user.delete(permanent=True)\n    feature_project.delete(permanent=True)\n    feature_dict.delete(permanent=True)\n    feature_artifact.delete(permanent=True)\n    feature_artifact_2.delete(permanent=True)\n    feature_run.delete(permanent=True)\n    feature_ulabel.delete(permanent=True)\n    feature_cell_lines.delete(permanent=True)\n    record_entity1.delete(permanent=True)\n    record_entity2.delete(permanent=True)\n    record_type1.delete(permanent=True)\n    test_project.delete(permanent=True)\n    feature_cell_line.delete(permanent=True)\n    feature_cl_ontology_id.delete(permanent=True)\n    feature_gene_ontology_id.delete(permanent=True)\n    hek293.delete(permanent=True)\n    a549.delete(permanent=True)\n    gene1.delete(permanent=True)\n    gene2.delete(permanent=True)\n    ulabel.delete(permanent=True)\n    artifact.delete(permanent=True)\n    run.delete(permanent=True)\n    
transform.delete(permanent=True)\n\n\ndef test_features_name_duplicates_across_root_and_nested():\n    feature1 = ln.Feature(name=\"sample_name\", dtype=ln.Record).save()\n    lab_a_type = ln.Feature(name=\"LabA\", is_type=True).save()\n    feature2 = ln.Feature(name=\"sample_name\", dtype=ln.Record, type=lab_a_type).save()\n    record_sample = ln.Record(name=\"sample\").save()\n    test_artifact = ln.Artifact(\".gitignore\", key=\"test_artifact\").save()\n    test_artifact.features.add_values({\"sample_name\": \"sample\"})\n    assert test_artifact.features.get_values() == {\"sample_name\": \"sample\"}\n    test_artifact.delete(permanent=True)\n    record_sample.delete(permanent=True)\n    feature1.delete(permanent=True)\n    feature2.delete(permanent=True)\n    lab_a_type.delete(permanent=True)\n\n\n# also see test_curator_schema_feature_mapping\ndef test_features_name_duplicates_across_equal_levels():\n    lab_a_type = ln.Feature(name=\"LabA\", is_type=True).save()\n    feature1 = ln.Feature(name=\"sample_name\", dtype=ln.Record, type=lab_a_type).save()\n    lab_b_type = ln.Feature(name=\"LabB\", is_type=True).save()\n    feature2 = ln.Feature(name=\"sample_name\", dtype=ln.Record, type=lab_b_type).save()\n    schema1 = ln.Schema([feature1], name=\"Lab A schema\").save()\n    record_sample = ln.Record(name=\"sample\").save()\n    test_artifact = ln.Artifact(\".gitignore\", key=\"test_artifact\").save()\n\n    # cannot disambiguate without schema\n    with pytest.raises(ln.errors.ValidationError) as error:\n        test_artifact.features.add_values({\"sample_name\": \"sample\"})\n    assert (\n        \"Ambiguous match for Feature 'sample_name': found 2 features at depth 1 (under types: ['LabA', 'LabB'])\"\n        in error.exconly()\n    )\n\n    # with schema, first one\n    test_artifact.features.add_values({\"sample_name\": \"sample\"}, schema=schema1)\n    assert test_artifact.features.get_values() == {\"sample_name\": \"sample\"}\n    assert 
test_artifact.links_record.get().feature.type == lab_a_type\n\n    test_artifact.delete(permanent=True)\n    test_artifact = ln.Artifact(\".gitignore\", key=\"test_artifact\").save()\n\n    # now the other schema\n    schema2 = ln.Schema([feature2], name=\"Lab B schema\").save()\n    test_artifact.features.add_values({\"sample_name\": \"sample\"}, schema=schema2)\n    assert test_artifact.features.get_values() == {\"sample_name\": \"sample\"}\n    assert test_artifact.links_record.get().feature.type == lab_b_type\n\n    test_artifact.delete(permanent=True)\n    record_sample.delete(permanent=True)\n    schema2.delete(permanent=True)\n    schema1.delete(permanent=True)\n    feature1.delete(permanent=True)\n    feature2.delete(permanent=True)\n    lab_a_type.delete(permanent=True)\n    lab_b_type.delete(permanent=True)\n\n\ndef test_feature_predicate_queries_safe_hybrid():\n    lab_a_type = ln.Feature(name=\"PredLabA\", is_type=True).save()\n    feature_a = ln.Feature(name=\"pred_name\", dtype=str, type=lab_a_type).save()\n    lab_b_type = ln.Feature(name=\"PredLabB\", is_type=True).save()\n    feature_b = ln.Feature(name=\"pred_name\", dtype=str, type=lab_b_type).save()\n    score_feature = ln.Feature(name=\"pred_score\", dtype=int).save()\n    cell_type_feature = ln.Feature(name=\"pred_cell_type\", dtype=bt.CellLine).save()\n\n    # safe hybrid behavior for model identity + hashability\n    assert feature_a == feature_a\n    assert feature_a != feature_b\n    assert len({feature_a, feature_b}) == 2\n\n    schema_a = ln.Schema([feature_a], name=\"pred schema a\").save()\n    schema_b = ln.Schema([feature_b], name=\"pred schema b\").save()\n\n    artifact_a = ln.Artifact(\n        \".gitignore\",\n        key=\"pred-artifact-a\",\n        skip_hash_lookup=True,\n    ).save()\n    artifact_b = ln.Artifact(\n        \".gitignore\",\n        key=\"pred-artifact-b\",\n        skip_hash_lookup=True,\n    ).save()\n    artifact_a.features.add_values({\"pred_name\": 
\"hello\"}, schema=schema_a)\n    artifact_b.features.add_values({\"pred_name\": \"hello\"}, schema=schema_b)\n    artifact_a.features.add_values({\"pred_score\": 5})\n    artifact_b.features.add_values({\"pred_score\": 1})\n    hek293 = bt.CellLine.from_source(name=\"HEK293\").save()\n    artifact_a.features.add_values({\"pred_cell_type\": hek293})\n\n    # same feature name can be disambiguated by passing the Feature object\n    assert ln.Artifact.filter(feature_a == \"hello\").one() == artifact_a\n    assert ln.Artifact.filter(feature_b == \"hello\").one() == artifact_b\n    # Feature compared to another model should still generate a predicate\n    assert ln.Artifact.filter(cell_type_feature == hek293).one() == artifact_a\n\n    # comparator operators on non-categorical feature values\n    assert ln.Artifact.filter(score_feature > 2).one() == artifact_a\n    assert ln.Artifact.filter(score_feature <= 1).one() == artifact_b\n    neq_results = ln.Artifact.filter(score_feature != 5)\n    assert artifact_b in neq_results\n    assert artifact_a not in neq_results\n\n    # mixed predicate and regular kwargs filters\n    assert (\n        ln.Artifact.filter(feature_a == \"hello\", key=\"pred-artifact-a\").one()\n        == artifact_a\n    )\n\n    artifact_a.delete(permanent=True)\n    artifact_b.delete(permanent=True)\n    schema_a.delete(permanent=True)\n    schema_b.delete(permanent=True)\n    feature_a.delete(permanent=True)\n    feature_b.delete(permanent=True)\n    score_feature.delete(permanent=True)\n    cell_type_feature.delete(permanent=True)\n    lab_a_type.delete(permanent=True)\n    lab_b_type.delete(permanent=True)\n    hek293.delete(permanent=True)\n\n\ndef test_features_add_with_schema():\n    df = mini_immuno.get_dataset1(otype=\"DataFrame\")\n    artifact = ln.Artifact.from_dataframe(df, description=\"test dataset\").save()\n\n    species = ln.Feature(name=\"species\", dtype=\"str\").save()\n    split = ln.Feature(name=\"split\", 
dtype=\"str\").save()\n    schema = ln.Schema([species, split]).save()\n\n    with pytest.raises(ln.errors.ValidationError) as e:\n        artifact.features.add_values({\"doesnot\": \"exist\"}, schema=schema)\n    assert \"column 'split' not in dataframe\" in str(e.value)\n\n    artifact.features.add_values({\"species\": \"bird\", \"split\": \"train\"}, schema=schema)\n    artifact.save()\n\n    assert artifact.features.get_values() == {\"species\": \"bird\", \"split\": \"train\"}\n\n    artifact.delete(permanent=True)\n    schema.delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n\n\ndef test_artifact_feature_cat_filters_schema_end_to_end():\n    schema_feature = ln.Feature(name=\"schema_filter_column_e2e\", dtype=str).save()\n    required_schema = ln.Schema(\n        name=\"required_schema_for_artifact_filter\",\n        features=[schema_feature],\n    ).save()\n    artifact_feature = ln.Feature(\n        name=\"input_artifact\",\n        dtype=ln.Artifact,\n        cat_filters={\"schema\": required_schema},\n    ).save()\n    container_artifact = ln.Artifact(\n        \".gitignore\",\n        key=\"container_for_artifact_schema_filter\",\n        skip_hash_lookup=True,\n    ).save()\n    artifact_without_schema = ln.Artifact(\n        \".gitignore\",\n        key=\"artifact_without_required_schema\",\n        skip_hash_lookup=True,\n    ).save()\n    artifact_with_schema = ln.Artifact(\n        \".gitignore\",\n        key=\"artifact_with_required_schema\",\n        schema=required_schema,\n        skip_hash_lookup=True,\n    ).save()\n\n    try:\n        with pytest.raises(ln.errors.ValidationError) as error:\n            container_artifact.features.add_values(\n                {\"input_artifact\": artifact_without_schema.key}\n            )\n        assert \"1 term not validated in feature 'input_artifact'\" in error.exconly()\n\n        container_artifact.features.add_values(\n            {\"input_artifact\": artifact_with_schema.key}\n   
     )\n        assert container_artifact.features[\"input_artifact\"] == artifact_with_schema\n    finally:\n        container_artifact.delete(permanent=True)\n        artifact_without_schema.delete(permanent=True)\n        artifact_with_schema.delete(permanent=True)\n        artifact_feature.delete(permanent=True)\n        required_schema.delete(permanent=True)\n        schema_feature.delete(permanent=True)\n\n\ndef test_features_add_remove_error_behavior():\n    \"\"\"Add/remove/validation behavior.\"\"\"\n    adata = ln.examples.datasets.anndata_with_obs()\n    artifact = ln.Artifact.from_anndata(adata, description=\"test\").save()\n    with pytest.raises(ln.errors.ValidationError) as error:\n        artifact.features.add_values({\"experiment\": \"Experiment 1\"})\n    assert (\n        error.exconly()\n        == \"\"\"lamindb.errors.ValidationError: These keys could not be validated: ['experiment']\nHere is how to create a feature:\n\n  ln.Feature(name='experiment', dtype='cat ? str').save()\"\"\"\n    )\n    ln.Feature(name=\"experiment\", dtype=ln.Record).save()\n    with pytest.raises(ln.errors.ValidationError) as error:\n        artifact.features.add_values({\"experiment\": \"Experiment 1\"})\n    assert error.exconly().startswith(\n        \"lamindb.errors.ValidationError: 1 term not validated in feature 'experiment'\"\n    )\n    ln.Record(name=\"Experiment 1\").save()\n    # now add the label with the feature and make sure that it has the feature annotation\n    artifact.features.add_values({\"experiment\": \"Experiment 1\"})\n    assert artifact.links_record.get().record.name == \"Experiment 1\"\n    assert artifact.links_record.get().feature.name == \"experiment\"\n    # repeat\n    artifact.features.add_values({\"experiment\": \"Experiment 1\"})\n    assert artifact.links_record.get().record.name == \"Experiment 1\"\n\n    # numerical feature\n    temperature = ln.Feature(name=\"temperature\", dtype=ln.Record).save()\n    with 
pytest.raises(TypeError) as error:\n        artifact.features.add_values({\"temperature\": 27.2})\n    assert error.exconly().startswith(\n        \"TypeError: Type mismatch: identifiers are 'numeric' but field_values are 'str/categorical'.\"\n    )\n    temperature.delete(permanent=True)\n    temperature = ln.Feature(name=\"temperature\", dtype=\"num\").save()\n    artifact.features.add_values({\"temperature\": 27.2})\n    assert artifact.json_values.first().value == 27.2\n\n    # datetime feature\n    with pytest.raises(ln.errors.ValidationError) as error:\n        artifact.features.add_values({\"date_of_experiment\": \"2024-12-01\"})\n    assert (\n        error.exconly()\n        == \"\"\"lamindb.errors.ValidationError: These keys could not be validated: ['date_of_experiment']\nHere is how to create a feature:\n\n  ln.Feature(name='date_of_experiment', dtype='date').save()\"\"\"\n    )\n\n    ln.Feature(name=\"date_of_experiment\", dtype=datetime.date, coerce=True).save()\n    with pytest.raises(ln.errors.ValidationError) as error:\n        artifact.features.add_values({\"date_of_experiment\": \"Typo2024-12-01\"})\n    assert \"WRONG_DATATYPE\" in error.exconly()\n    artifact.features.add_values({\"date_of_experiment\": \"2024-12-01\"})\n\n    ln.Feature(name=\"datetime_of_experiment\", dtype=datetime, coerce=True).save()\n    artifact.features.add_values({\"datetime_of_experiment\": \"2024-12-01 00:00:00\"})\n\n    # bionty feature\n    mouse = bt.Organism.from_source(name=\"mouse\")\n    with pytest.raises(ln.errors.ValidationError) as error:\n        artifact.features.add_values({\"organism\": mouse})\n    assert (\n        error.exconly()\n        == \"\"\"lamindb.errors.ValidationError: These keys could not be validated: ['organism']\nHere is how to create a feature:\n\n  ln.Feature(name='organism', dtype='cat[bionty.Organism]').save()\"\"\"\n    )\n    ln.Feature(name=\"organism\", dtype=bt.Organism).save()\n    with 
pytest.raises(ln.errors.ValidationError) as error:\n        artifact.features.add_values({\"organism\": mouse})\n    assert (\n        # ensure the label is saved\n        error.exconly()\n        == \"lamindb.errors.ValidationError: Organism mouse is not saved.\"\n    )\n    mouse.save()\n    artifact.features.add_values({\"organism\": mouse})\n    assert artifact.organisms.get().name == \"mouse\"\n\n    # lists of records\n    diseases = bt.Disease.from_values(\n        [\"MONDO:0004975\", \"MONDO:0004980\"], field=bt.Disease.ontology_id\n    ).save()\n    ln.Feature(name=\"disease\", dtype=bt.Disease.ontology_id).save()\n    artifact.features.add_values({\"disease\": diseases})\n    assert len(artifact.diseases.filter()) == 2\n    # check get_values returns ontology_ids as specified in the feature dtype\n    assert artifact.features.get_values()[\"disease\"] == {\n        \"MONDO:0004975\",\n        \"MONDO:0004980\",\n    }\n\n    # big dictionary of everything\n    features = {\n        \"experiment\": [  # we're testing iterable annotation here\n            \"Experiment 2\",\n            \"Experiment 1\",\n        ],\n        \"project\": \"project_1\",\n        \"is_validated\": True,\n        \"cell_type_by_expert\": \"T cell\",\n        \"temperature\": 100.0,\n        \"donor\": \"U0123\",\n    }\n    with pytest.raises(ln.errors.ValidationError) as error:\n        artifact.features.add_values(features)\n    assert (\n        error.exconly()\n        == \"\"\"\\\nlamindb.errors.ValidationError: These keys could not be validated: ['project', 'is_validated', 'cell_type_by_expert', 'donor']\nHere is how to create a feature:\n\n  ln.Feature(name='project', dtype='cat ? str').save()\n  ln.Feature(name='is_validated', dtype='bool').save()\n  ln.Feature(name='cell_type_by_expert', dtype='cat ? str').save()\n  ln.Feature(name='donor', dtype='cat ? 
str').save()\"\"\"\n    )\n\n    ln.Feature(name=\"project\", dtype=ln.Record).save()\n    ln.Feature(name=\"is_validated\", dtype=bool).save()\n    ln.Feature(name=\"cell_type_by_expert\", dtype=bt.CellType).save()\n    ln.Feature(name=\"donor\", dtype=ln.Record).save()\n\n    with pytest.raises(ln.errors.ValidationError) as error:\n        artifact.features.add_values(features)\n        error_msg = error.exconly()\n\n        assert (\n            \"lamindb.errors.ValidationError: These values could not be validated:\"\n            in error_msg\n        )\n        assert \"Here is how to create records for them:\" in error_msg\n\n        expected_values = {\n            \"Record\": [\"project_1\", \"U0123\", \"Experiment 2\"],\n            \"bionty.CellType\": [\"T cell\"],\n        }\n\n        for key, values in expected_values.items():\n            assert f\"'{key}':\" in error_msg\n            for value in values:\n                assert value in error_msg\n            assert f\"{key.split('.')[-1]}.from_values(\" in error_msg\n\n        assert \"create=True).save()\" in error_msg\n\n    ln.Record.from_values([\"Experiment 2\", \"project_1\", \"U0123\"], create=True).save()\n    bt.CellType.from_source(name=\"T cell\").save()\n\n    artifact.features.add_values(features)\n    assert set(artifact.json_values.all().values_list(\"value\", flat=True)) == {\n        27.2,\n        True,\n        100.0,\n        \"2024-12-01\",\n        \"2024-12-01T00:00:00\",\n    }\n\n    assert ln.Artifact.get(json_values__value=27.2)\n\n    assert artifact.features.get_values() == {\n        \"disease\": {\"MONDO:0004975\", \"MONDO:0004980\"},\n        \"experiment\": {\"Experiment 1\", \"Experiment 2\"},\n        \"project\": \"project_1\",\n        \"cell_type_by_expert\": \"T cell\",\n        \"donor\": \"U0123\",\n        \"organism\": \"mouse\",\n        \"is_validated\": True,\n        \"temperature\": {27.2, 100.0},\n        \"date_of_experiment\": date(2024, 12, 1),\n   
     \"datetime_of_experiment\": datetime(2024, 12, 1, 0, 0, 0),\n    }\n    # hard to test because of italic formatting\n    assert (\n        artifact.features.describe(return_str=True)\n        == \"\"\"Artifact:  (0000)\n|   description: test\n└── Features\n    └── cell_type_by_expe…  bionty.CellType          T cell\n        disease             bionty.Disease.ontolog…  MONDO:0004975, MONDO:00049…\n        donor               Record                   U0123\n        experiment          Record                   Experiment 1, Experiment 2\n        organism            bionty.Organism          mouse\n        project             Record                   project_1\n        date_of_experiment  date                     2024-12-01\n        datetime_of_exper…  datetime                 2024-12-01 00:00:00\n        is_validated        bool                     True\n        temperature         num                      27.2, 100.0\"\"\"\n    )\n\n    # repeat\n    artifact.features.add_values(features)\n    assert set(artifact.json_values.all().values_list(\"value\", flat=True)) == {\n        27.2,\n        True,\n        100.0,\n        \"2024-12-01\",\n        \"2024-12-01T00:00:00\",\n    }\n\n    # test remove_values\n    artifact.features.remove_values(\"date_of_experiment\")\n    alzheimer = bt.Disease.get(name=\"Alzheimer disease\")\n    artifact.features.remove_values(\"disease\", value=alzheimer)\n    values = artifact.features.get_values()\n    assert \"date_of_experiment\" not in values\n    assert \"MONDO:0004975\" not in values[\"disease\"]\n\n    # test annotate with dictionaries multiple times\n    ln.Feature(name=\"study_metadata\", dtype=dict).save()\n    artifact.features.add_values({\"study_metadata\": {\"detail1\": \"123\", \"detail2\": 1}})\n\n    # delete everything we created\n    artifact.delete(permanent=True)\n    ln.Record.filter().delete(permanent=True)\n    ln.Schema.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n  
  bt.Gene.filter().delete(permanent=True)\n    bt.Organism.filter().delete(permanent=True)\n    bt.Disease.filter().delete(permanent=True)\n\n\ndef test_add_remove_list_features(ccaplog):\n    feature = ln.Feature(name=\"list_of_str\", dtype=list[str]).save()\n    artifact = ln.Artifact(\".gitignore\", key=\".gitignore\").save()\n    artifact.features.add_values({\"list_of_str\": [\"1\", \"2\", \"3\"]})\n    assert artifact.features.get_values() == {\"list_of_str\": [\"1\", \"2\", \"3\"]}\n    # remove a non-linked value, this should do nothing but print a warning\n    artifact.features.remove_values(\"list_of_str\", value=\"4\")\n    assert \"no feature 'list_of_str' with value '4' found\" in ccaplog.text\n    # list of categories feature\n    cell_types_feature = ln.Feature(\n        name=\"cell_types\", dtype=\"list[cat[bionty.CellType]]\"\n    ).save()\n    bt.CellType.from_values([\"T cell\", \"B cell\"]).save()\n    artifact.features.add_values({\"cell_types\": [\"T cell\", \"B cell\"]})\n    assert set(artifact.features.get_values()[\"cell_types\"]) == {\"B cell\", \"T cell\"}\n    # passing value works here because we are linking each of the cell types in the list individually\n    # in comparison to passing a list of numbers above\n    t_cell = bt.CellType.get(name=\"T cell\")\n    artifact.features.remove_values(\"cell_types\", value=t_cell)\n    assert artifact.features.get_values()[\"cell_types\"] == [\"B cell\"]\n    # remove a non-linked value, this should print a warning but do nothing\n    artifact.features.remove_values(\"cell_types\", value=t_cell.parents.first())\n    assert \"no feature 'cell_types' with value CellType(\" in ccaplog.text\n    # remove the entire linked feature\n    artifact.features.remove_values(\"cell_types\")\n    assert \"cell_types\" not in artifact.features.get_values()\n\n    # clean up\n    artifact.delete(permanent=True)\n    assert ln.models.JsonValue.filter(feature__name=\"list_of_str\").count() == 1\n    
feature.delete(permanent=True)\n    assert ln.models.JsonValue.filter(feature__name=\"list_of_str\").count() == 0\n    cell_types_feature.delete(permanent=True)\n    bt.CellType.filter().delete(permanent=True)\n\n\ndef test_add_list_of_cat_features():\n    type_1 = ln.Record(name=\"Type 1\", is_type=True).save()\n    for label in [\"label 1\", \"label 2\", \"label 3\"]:\n        ln.Record(name=label, type=type_1).save()\n    feat1 = ln.Feature(\n        name=\"single_label_of_type1\", dtype=type_1, nullable=False\n    ).save()\n    feat2 = ln.Feature(\n        name=\"list_of_labels_of_type1\", dtype=list[type_1], nullable=False\n    ).save()\n    schema = ln.Schema(name=\"Test schema\", features=[feat1, feat2]).save()\n    artifact = ln.Artifact(\n        \".gitignore\",\n        key=\".gitignore\",\n    ).save()\n    # now just use add_values()\n    with pytest.raises(ln.errors.ValidationError) as error:\n        artifact.features.add_values(\n            {\n                \"single_label_of_type1\": \"invalid\",\n            }\n        )\n    assert error.exconly().startswith(\n        \"lamindb.errors.ValidationError: 1 term not validated in feature 'single_label_of_type1': 'invalid'\"\n    )\n    # now for list of labels\n    with pytest.raises(ln.errors.ValidationError) as error:\n        artifact.features.add_values(\n            {\n                \"list_of_labels_of_type1\": [\"invalid\", \"invalid2\"],\n            }\n        )\n    assert error.exconly().startswith(\n        \"lamindb.errors.ValidationError: 2 terms not validated in feature 'list_of_labels_of_type1':\"\n    )\n    artifact.delete(permanent=True)\n    # now with schema\n    artifact = ln.Artifact(\n        \".gitignore\",\n        key=\".gitignore\",\n        schema=schema,\n        features={\n            \"single_label_of_type1\": \"label 1\",\n            \"list_of_labels_of_type1\": [\"label 1\", \"label 2\"],\n        },\n    ).save()\n    with pytest.raises(ValueError) as error:\n    
    artifact.features.add_values(\n            {\n                \"single_label_of_type1\": \"invalid\",\n            }\n        )\n    assert \"Cannot add values if artifact has external schema.\" in error.exconly()\n\n    artifact.delete(permanent=True)\n    schema.delete(permanent=True)\n    feat1.delete(permanent=True)\n    feat2.delete(permanent=True)\n    type_1.records.all().delete(permanent=True)\n    type_1.delete(permanent=True)\n\n\ndef test_artifact_features_accept_feature_object_keys():\n    feature_score = ln.Feature(name=\"artifact_feature_object_score\", dtype=int).save()\n    feature_tag = ln.Feature(name=\"artifact_feature_object_tag\", dtype=str).save()\n    artifact = ln.Artifact(\".gitignore\", key=\"artifact_feature_object_test\").save()\n\n    artifact.features.add_values({feature_score: 7, \"artifact_feature_object_tag\": \"a\"})\n    assert artifact.features.get_values() == {\n        \"artifact_feature_object_score\": 7,\n        \"artifact_feature_object_tag\": \"a\",\n    }\n\n    # set_values should also accept Feature objects as dictionary keys.\n    artifact.features.set_values({feature_score: 8})\n    assert artifact.features.get_values() == {\"artifact_feature_object_score\": 8}\n\n    artifact.features.add_values({feature_tag: \"keep\"})\n    assert artifact.features.get_values() == {\n        \"artifact_feature_object_score\": 8,\n        \"artifact_feature_object_tag\": \"keep\",\n    }\n\n    # remove_values supports dictionary inputs with Feature keys.\n    artifact.features.remove_values({feature_score: 8, feature_tag: None})\n    assert artifact.features.get_values() == {}\n\n    artifact.delete(permanent=True)\n    feature_score.delete(permanent=True)\n    feature_tag.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_artifact_parquet.py",
    "content": "import lamindb as ln\nimport pandas as pd\nimport pyarrow.parquet as pq\n\n\ndef test_parquet_kwargs():\n    df = pd.DataFrame(\n        {\n            \"a\": [3, 1, 4, 2],\n            \"b\": [\"c\", \"a\", \"d\", \"b\"],\n            \"c\": [3.3, 1.1, 4.4, 2.2],\n        }\n    )\n    df_sorted = df.sort_values(by=[\"a\", \"b\"])\n    sorting_columns = [\n        pq.SortingColumn(0, descending=False, nulls_first=False),\n        pq.SortingColumn(1, descending=False, nulls_first=False),\n    ]\n    artifact = ln.Artifact.from_dataframe(\n        df_sorted,\n        key=\"df_sorted.parquet\",\n        parquet_kwargs={\"sorting_columns\": sorting_columns},\n    ).save()\n    pyarrow_dataset = artifact.open()\n    fragment = next(pyarrow_dataset.get_fragments())\n    assert list(fragment.metadata.row_group(0).sorting_columns) == sorting_columns\n"
  },
  {
    "path": "tests/core/test_blocks.py",
    "content": "import lamindb as ln\nimport pytest\n\n\ndef test_block_recovery_based_on_hash():\n    block1 = ln.models.Block(key=\"__lamindb_block__\", content=\"1\", kind=\"readme\").save()\n    block2 = ln.models.Block(key=\"__lamindb_block__\", content=\"1\", kind=\"readme\")\n    assert block1 == block2\n    block1.delete()\n    block2 = ln.models.Block(key=\"__lamindb_block__\", content=\"1\", kind=\"readme\")\n    assert block1 != block2\n    block1.delete(permanent=True)\n\n\ndef test_block_recovery_based_on_key():\n    block1 = ln.models.Block(key=\"__lamindb_block__\", kind=\"readme\").save()\n    block2 = ln.models.Block(key=\"__lamindb_block__\", kind=\"readme\")\n    assert block1 == block2\n    block1.delete()\n    block2 = ln.models.Block(key=\"__lamindb_block__\", kind=\"readme\")\n    assert block1 != block2\n    block1.delete(permanent=True)\n\n\ndef test_readme_md_key_is_allowed_and_revises():\n    block1 = ln.models.Block(\n        key=\"README.md\", content=\"# v1\\n\\nhello\", kind=\"readme\"\n    ).save()\n    block2 = ln.models.Block(key=\"README.md\", content=\"# v2\\n\\nhello\", kind=\"readme\")\n    assert block2.stem_uid == block1.stem_uid\n    assert block2.uid != block1.uid\n    block2.save()\n    block1.refresh_from_db()\n    assert not block1.is_latest\n    block2.delete()\n    block1.delete()\n\n\ndef test_revise_blocks():\n    # attempt to create a block with an invalid version\n    with pytest.raises(ValueError) as error:\n        ln.models.Block(key=\"__lamindb_block__\", version=0, kind=\"readme\")\n    assert \"version\" in error.exconly() or \"version_tag\" in error.exconly()\n\n    # create a versioned block\n    block = ln.models.Block(key=\"__lamindb_block__\", version=\"1\", kind=\"readme\")\n    assert block.version_tag == \"1\"\n    assert block.version == \"1\"\n    assert len(block.uid) == ln.models.Block._len_full_uid == 20\n    assert len(block.stem_uid) == ln.models.Block._len_stem_uid == 16\n\n    
block.save()\n\n    # try to reload the same block with the same uid\n    block_reload = ln.models.Block(\n        uid=block.uid, key=\"__lamindb_artifact__\", kind=\"readme\"\n    )\n    assert block_reload.id == block.id\n    assert block_reload.key == \"__lamindb_block__\"  # unchanged, prints logging\n\n    # create new block from old block\n    block_r2 = ln.models.Block(content=\"v2\", revises=block, kind=\"readme\")\n    assert block_r2.uid != block.uid\n    assert block_r2.uid.endswith(\"0001\")\n    block_r2 = ln.models.Block(content=\"v2\", revises=block, kind=\"readme\")\n    assert block_r2.uid != block.uid\n    assert block_r2.uid.endswith(\"0001\")\n    assert block_r2.stem_uid == block.stem_uid\n    assert block_r2.version_tag is None\n    assert block_r2.version == block_r2.uid[-4:]\n    assert block_r2.is_latest\n    assert block.is_latest\n    block_r2.save()\n    assert not block.is_latest\n\n    # create new block from newly versioned block\n    block_r3 = ln.models.Block(\n        content=\"v3\", revises=block_r2, version=\"2\", kind=\"readme\"\n    )\n    assert block_r3.stem_uid == block.stem_uid\n    assert block_r3.version_tag == \"2\"\n    assert block_r3.version == \"2\"\n\n    # revise by matching on key\n    key = \"__lamindb_artifact__\"\n    block_r2.key = key\n    block_r2.save()\n    assert block_r2.is_latest\n    block_r3 = ln.models.Block(content=\"v3\", key=key, version=\"2\", kind=\"readme\")\n    assert block_r3.uid[:-4] == block_r2.uid[:-4]\n    assert block_r3.uid != block_r2.uid  # new version after block_r2\n    block_r2.content = \"something else\"\n    block_r2.save()\n    block_r3 = ln.models.Block(content=\"v3\", key=key, version=\"2\", kind=\"readme\")\n    assert block_r3.uid[:-4] == block_r2.uid[:-4]\n    assert block_r3.uid != block_r2.uid  # yet another new version\n    assert block_r3.stem_uid == block_r2.stem_uid\n    assert block_r3.key == key\n    assert block_r3.version_tag == \"2\"\n    assert 
block_r3.version == \"2\"\n    assert block_r3.is_latest\n    assert block_r2.is_latest\n    assert block_r3._revises is not None\n    block_r3.save()\n    block_r2 = ln.models.Block.get(block_r2.uid)\n    assert not block_r2.is_latest\n\n    # wrong block type\n    with pytest.raises(TypeError) as error:\n        ln.models.Block(\n            key=\"__lamindb_block__\", revises=ln.Record(name=\"x\"), kind=\"readme\"\n        )\n    assert error.exconly().startswith(\"TypeError: `revises` has to be of type `Block`\")\n\n    # wrong kwargs\n    with pytest.raises(ValueError) as error:\n        ln.models.Block(key=\"__lamindb_block__\", x=1, kind=\"readme\")\n    assert \"can be passed\" in error.exconly() and \"x\" in error.exconly()\n\n    # kind required (Block only supports kind=\"readme\")\n    with pytest.raises(ValueError) as error:\n        ln.models.Block(key=\"__lamindb_block__\", content=\"y\")\n    assert \"kind\" in error.exconly() and \"readme\" in error.exconly()\n\n    # invalid kind (Block only supports readme)\n    with pytest.raises(ValueError) as error:\n        ln.models.Block(key=\"__lamindb_block__\", content=\"y\", kind=\"comment\")\n    assert \"readme\" in error.exconly() or \"Only kind\" in error.exconly()\n\n    # cleanup\n    block_r2.delete()\n    block.delete()\n\n    # unversioned block\n    block = ln.models.Block(key=\"__lamindb_block__\", kind=\"readme\")\n    assert block.version_tag is None\n    assert block.version == block.uid[-4:]\n    block.save()\n\n    # create new block from old block\n    new_block = ln.models.Block(content=\"new\", revises=block, kind=\"readme\")\n    assert block.version_tag is None\n    assert block.version == block.uid[-4:]\n    assert new_block.stem_uid == block.stem_uid\n    assert new_block.uid.endswith(\"0001\")\n    assert new_block.version_tag is None\n    assert new_block.version == new_block.uid[-4:]\n\n    block.delete(permanent=True)\n\n\ndef test_record_block_readme_always_new_version():\n    
\"\"\"Readme always creates a new version (no content-hash dedup).\"\"\"\n    record = ln.Record(name=\"test-record-blocks\").save()\n    block1 = ln.models.RecordBlock(record=record, content=\"1\", kind=\"readme\").save()\n    block2 = ln.models.RecordBlock(record=record, content=\"1\", kind=\"readme\")\n    assert block1.stem_uid == block2.stem_uid\n    assert block1.uid != block2.uid  # new version each time\n    block1.delete()  # BaseSQLRecord has no soft delete; this is permanent\n    block2 = ln.models.RecordBlock(record=record, content=\"1\", kind=\"readme\")\n    assert block1 != block2  # block2 is a new block (block1 was removed)\n    record.delete(permanent=True)\n\n\ndef test_record_block_comment_always_new_block():\n    \"\"\"Comment always creates a new block (no versioning; revises not allowed).\"\"\"\n    record = ln.Record(name=\"test-record-blocks-comment\").save()\n    # Add readme and comments to test full describe\n    ln.models.RecordBlock(\n        record=record, content=\"# Overview\\n\\nTest readme.\", kind=\"readme\"\n    ).save()\n    # Comments never version: each creation is a new comment (new uid).\n    comment1 = ln.models.RecordBlock(\n        record=record, content=\"same text\", kind=\"comment\"\n    ).save()\n    comment2 = ln.models.RecordBlock(record=record, content=\"same text\", kind=\"comment\")\n    assert comment1.stem_uid != comment2.stem_uid  # always new comment, no dedup\n    # revises is not allowed for kind='comment'\n    with pytest.raises(ValueError) as error:\n        ln.models.RecordBlock(\n            record=record, content=\"a comment\", kind=\"comment\", revises=comment1\n        )\n    assert \"revises is not allowed for kind='comment'\" in error.exconly()\n\n    # Test full describe call with include=\"comments\"\n    result = record.describe(return_str=True, include=\"comments\")\n    assert \"README\" in result\n    assert \"comment by\" in result\n    assert \"same text\" in result\n\n    
comment1.delete()\n    record.delete(permanent=True)\n\n\ndef test_record_block_recovery_based_on_record_and_kind():\n    record = ln.Record(name=\"test-record-blocks-key\").save()\n    block1 = ln.models.RecordBlock(record=record, kind=\"readme\").save()\n    block2 = ln.models.RecordBlock(record=record, kind=\"readme\")\n    assert block1 == block2\n    block1.delete()  # BaseSQLRecord has no soft delete; this is permanent\n    block2 = ln.models.RecordBlock(record=record, kind=\"readme\")\n    assert block1 != block2  # block2 is a new block (block1 was removed)\n    record.delete(permanent=True)\n\n\ndef test_revise_record_blocks():\n    record = ln.Record(name=\"test-record-revise\").save()\n\n    # create a versioned record block\n    block = ln.models.RecordBlock(\n        record=record, content=\"v1\", kind=\"readme\", version=\"1\"\n    )\n    assert block.version_tag == \"1\"\n    assert block.version == \"1\"\n    assert len(block.uid) == ln.models.RecordBlock._len_full_uid == 20\n    assert len(block.stem_uid) == ln.models.RecordBlock._len_stem_uid == 16\n    block.save()\n\n    # reload same block by uid\n    block_reload = ln.models.RecordBlock(record=record, uid=block.uid, kind=\"readme\")\n    assert block_reload.id == block.id\n\n    # create new block from old block\n    block_r2 = ln.models.RecordBlock(\n        record=record, content=\"v2\", kind=\"readme\", revises=block\n    )\n    assert block_r2.uid != block.uid\n    assert block_r2.uid.endswith(\"0001\")\n    assert block_r2.stem_uid == block.stem_uid\n    assert block_r2.is_latest\n    assert block.is_latest\n    block_r2.save()\n    assert not block.is_latest\n\n    # create new block from newly versioned block\n    block_r3 = ln.models.RecordBlock(\n        record=record, content=\"v3\", kind=\"readme\", revises=block_r2, version=\"2\"\n    )\n    assert block_r3.stem_uid == block.stem_uid\n    assert block_r3.version_tag == \"2\"\n    assert block_r3.version == \"2\"\n\n    # readme 
always creates a new version (no hash-based dedup)\n    block_r3.save()  # so next readme for this record gets revises=block_r3\n    block_same = ln.models.RecordBlock(record=record, content=\"v3\", kind=\"readme\")\n    assert block_same.stem_uid == block_r3.stem_uid\n    assert block_same.uid != block_r3.uid  # new version (0003)\n\n    # comment does not accept revises\n    with pytest.raises(ValueError) as error:\n        ln.models.RecordBlock(\n            record=record, content=\"a comment\", kind=\"comment\", revises=block\n        )\n    assert \"revises is not allowed for kind='comment'\" in error.exconly()\n\n    # wrong kwargs\n    with pytest.raises(ValueError) as error:\n        ln.models.RecordBlock(record=record, x=1)\n    assert \"can be passed\" in error.exconly()\n\n    # record required\n    with pytest.raises(ValueError) as error:\n        ln.models.RecordBlock(content=\"x\", kind=\"readme\")\n    assert \"record is required\" in error.exconly()\n\n    block_r2.delete()\n    block.delete()\n    record.delete(permanent=True)\n\n\ndef test_record_block_filter_respects_default_branch_scope():\n    main_branch = ln.Branch.get(name=\"main\")\n    ln.setup.switch(main_branch.name)\n\n    main_record = ln.Record(name=\"record-block-main\").save()\n    ln.models.RecordBlock(\n        record=main_record,\n        content=\"record-block-main-content\",\n        kind=\"readme\",\n        branch=main_branch,\n        created_on=main_branch,\n    ).save()\n\n    contrib = ln.Branch(name=\"record_block_scope_branch\").save()\n    ln.setup.switch(contrib.name)\n    contrib_record = ln.Record(name=\"record-block-contrib\").save()\n    contrib_block = ln.models.RecordBlock(\n        record=contrib_record,\n        content=\"record-block-contrib-content\",\n        kind=\"readme\",\n        branch=contrib,\n        created_on=contrib,\n    ).save()\n\n    assert (\n        ln.models.RecordBlock.filter(content=\"record-block-contrib-content\").count()\n        == 
1\n    )\n\n    ln.setup.switch(main_branch.name)\n    assert (\n        ln.models.RecordBlock.filter(content=\"record-block-contrib-content\").count()\n        == 0\n    )\n\n    contrib_block.delete()\n    contrib_record.delete(permanent=True)\n    main_record.delete(permanent=True)\n    contrib.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_branches.py",
    "content": "import lamindb as ln\n\n\ndef testbranch_id():\n    # create a file with default branch_id\n    with open(\"./testbranch_id.txt\", \"w\") as f:\n        f.write(\"branch_id\")\n    artifact = ln.Artifact(\"./testbranch_id.txt\", description=\"testbranch_id\").save()\n    assert artifact.branch_id == 1\n\n    # create a collection from file\n    collection = ln.Collection(artifact, key=\"testbranch_id\").save()\n\n    # delete a collection will put both collection but not linked artifact in trash\n    collection.delete()\n    assert collection.ordered_artifacts[0].branch_id == 1\n    result = ln.Collection.filter(key=\"testbranch_id\")\n    assert len(result) == 0\n    result = ln.Collection.filter(key=\"testbranch_id\", branch_id=1)\n    assert len(result) == 0\n    result = ln.Collection.filter(key=\"testbranch_id\", branch_id=None)\n    assert len(result) == 1\n\n    # restore\n    collection.restore()\n    assert collection.branch_id == 1\n    assert collection.ordered_artifacts[0].branch_id == 1\n\n    # permanent delete\n    collection.delete(permanent=True)\n    result = ln.Artifact.filter(description=\"testbranch_id\", branch_id=None)\n    # also permanently deleted linked file\n    assert len(result) == 1\n"
  },
  {
    "path": "tests/core/test_can_curate.py",
    "content": "import bionty as bt\nimport lamindb as ln\nimport pytest\nfrom lamindb.errors import ValidationError\n\n\n# some validate tests are in test_queryset\ndef test_inspect():\n    ln.Schema.filter().delete(permanent=True)\n    bt.Gene.filter().delete(permanent=True)\n    result = bt.Gene.inspect(\"TCF7\", \"symbol\", organism=\"human\")\n    assert result.validated == []\n\n    bt.Gene.from_source(symbol=\"TCF7\", organism=\"human\").save()\n    result = bt.Gene.inspect(\"TCF7\", organism=\"human\")\n    assert bt.Gene.validate(\"TCF7\", organism=\"human\")\n    result = bt.Gene.inspect([\"TCF7\", \"ABC1\"], \"symbol\", organism=\"human\")\n    assert result.validated == [\"TCF7\"]\n\n    # clean up\n    bt.Gene.filter().delete(permanent=True)\n\n\n# if a record was added to the DB via a different source\n# it will still be validated because it's in the DB\ndef test_inspect_source():\n    source1 = bt.Source.get(entity=\"bionty.CellType\", name=\"cl\")\n    source2 = bt.CellType.add_source(source=\"cl\", version=\"2022-08-16\")\n    bt.CellType.from_source(name=\"T cell\", source=source1).save()\n    assert bt.CellType.inspect(\"T-cell\", source=source2, mute=True).synonyms_mapper == {\n        \"T-cell\": \"T cell\"\n    }\n    assert (\n        bt.CellType.inspect(\n            \"T-cell\", source=source2, mute=True, strict_source=True\n        ).synonyms_mapper\n        == {}\n    )\n    assert bt.CellType.validate(\"T cell\", source=source2, mute=True).sum() == 1\n    assert (\n        bt.CellType.validate(\n            \"T cell\", source=source2, mute=True, strict_source=True\n        ).sum()\n        == 0\n    )\n    assert bt.CellType.standardize(\"T-cell\", source=source2, mute=True) == \"T cell\"\n    # here still standardized because of bionty\n    assert (\n        bt.CellType.standardize(\"T-cell\", source=source2, mute=True, strict_source=True)\n        == \"T cell\"\n    )\n    bt.CellType.filter().delete(permanent=True)\n\n\ndef 
test_standardize():\n    # synonym not in the database\n    result = bt.Gene.standardize([\"ABC1\", \"PDCD1\"], organism=\"human\")\n    assert result == [\"HEATR6\", \"PDCD1\"]\n\n    result = bt.Gene.standardize(\n        [\"ABC1\", \"PDCD1\"], field=bt.Gene.symbol, organism=\"human\"\n    )\n    assert result == [\"HEATR6\", \"PDCD1\"]\n\n    mapper = bt.Gene.standardize(\n        [\"ABC1\", \"PDCD1\"], return_mapper=True, organism=\"human\"\n    )\n    assert mapper == {\"ABC1\": \"HEATR6\"}\n\n    # synonym already in the database\n    bt.Gene.from_source(symbol=\"LMNA\", organism=\"human\").save()\n    mapper = bt.Gene.standardize([\"ABC1\", \"LMN1\"], return_mapper=True, organism=\"human\")\n    assert mapper == {\"LMN1\": \"LMNA\", \"ABC1\": \"HEATR6\"}\n    assert bt.Gene.standardize([\"LMNA\"], organism=\"human\") == [\"LMNA\"]\n    assert bt.Gene.standardize(\"LMNA\", organism=\"human\") == \"LMNA\"\n    assert bt.Gene.standardize([\"LMN1\"], return_mapper=True, organism=\"human\") == {\n        \"LMN1\": \"LMNA\"\n    }\n\n\ndef test_standardize_from_source():\n    result = bt.Gene.standardize([\"ABC1\", \"PDCD1\"], from_source=False)\n    assert result == [\"ABC1\", \"PDCD1\"]\n\n\ndef test_add_remove_synonym():\n    bt.CellType.filter().delete(permanent=True)\n\n    # a registry that doesn't have a synonyms column\n    user = ln.User.get(handle=ln.setup.settings.user.handle)\n    with pytest.raises(NotImplementedError):\n        user.add_synonym(\"syn\")\n\n    cell_types = bt.CellType.from_values([\"T cell\", \"B cell\"], \"name\")\n    ln.save(cell_types)\n    tcell = bt.CellType.get(name=\"T cell\")\n    bcell = bt.CellType.get(name=\"B cell\")\n    tcell.add_synonym([\"my cell type\"])\n    tcell.add_synonym(\"\")\n    tcell.add_synonym([])\n    assert \"my cell type\" in tcell.synonyms\n    with pytest.raises(ValidationError):\n        bcell.add_synonym(\"my cell type\")\n    with pytest.raises(ValidationError):\n        
tcell.add_synonym(\"my|celltype\")\n\n    tcell.remove_synonym(\"my cell type\")\n    assert \"my cell type\" not in tcell.synonyms\n\n    bcell.synonyms = None\n    bcell.save()\n    tcell.synonyms = None\n    tcell.save()\n    tcell.add_synonym(\"\")\n    tcell.add_synonym([\"\"])\n    tcell.add_synonym([])\n    tcell.add_synonym([\"my cell type\"])\n    tcell.add_synonym(\"\")\n    tcell.add_synonym([\"\"])\n    tcell.add_synonym([])\n    assert tcell.synonyms == \"my cell type\"\n    tcell.remove_synonym(\"my cell type\")\n\n    # clean up\n    bt.CellType.filter().delete(permanent=True)\n\n\ndef test_set_abbr():\n    bt.CellType.filter().delete(permanent=True)\n    bt.CellType(name=\"my cell type\").save()\n    record = bt.CellType.get(name=\"my cell type\")\n    # if abbr is name, do not add to synonyms\n    record.set_abbr(\"my cell type\")\n    assert record.abbr == \"my cell type\"\n    assert record.synonyms is None\n\n    record.set_abbr(\"myct\")\n    assert record.abbr == \"myct\"\n    assert \"myct\" in record.synonyms\n\n    source = bt.Source.filter(organism=\"human\").first()\n    with pytest.raises(AttributeError) as error:\n        source.set_abbr(\"abbr\")\n    assert (\n        error.exconly() == \"AttributeError: 'Source' object has no attribute 'set_abbr'\"\n    )\n\n    record.delete()\n\n\ndef test_validate_int():\n    result = ln.User.validate([1, 2, 3], field=ln.User.id)\n    assert result.sum() == 1\n\n\ndef test_synonym_mapping():\n    # only name field can be standardized\n    bt.Gene.from_source(symbol=\"TNFRSF4\", organism=\"human\").save()\n\n    result = bt.Gene.inspect(\n        [\"CD134\", \"TNFRSF4\"], field=bt.Gene.symbol, organism=\"human\"\n    )\n    assert result.synonyms_mapper == {\"CD134\": \"TNFRSF4\"}\n\n    result = bt.Gene.inspect(\n        [\"CD134\", \"TNFRSF4\"], field=bt.Gene.ensembl_gene_id, organism=\"human\"\n    )\n    assert result.synonyms_mapper == {}\n\n    bt.Gene.filter().delete(permanent=True)\n\n\ndef 
test_validate_called_on_object_raises_error():\n    \"\"\"Calling validate() on an object must raise TypeError.\"\"\"\n    label = ln.ULabel(name=\"test_label\").save()\n    with pytest.raises(TypeError) as error:\n        label.validate([\"test_value\"])\n    assert (\n        \"ULabel.validate() is a class method and must be called on the ULabel class, not on a ULabel object\"\n        in str(error.value)\n    )\n\n\ndef test_standardize_source():\n    \"\"\"When passing a specific source to standardize, any matched public records must come from the passed source.\"\"\"\n    # 'HANCESTRO:0006' in Hancestro 3.0 but 'HANCESTRO:0848' in later versions\n    assert (\n        bt.Ethnicity.standardize(\n            [\"South Asian\"],\n            field=\"name\",\n            return_field=\"ontology_id\",\n            source=bt.Source(\n                entity=\"bionty.Ethnicity\",\n                version=\"3.0\",\n                name=\"hancestro\",\n                organism=\"human\",\n            ),\n        )[0]\n        == \"HANCESTRO:0006\"\n    )\n"
  },
  {
    "path": "tests/core/test_collection.py",
    "content": "import re\n\nimport anndata as ad\nimport lamindb as ln\nimport numpy as np\nimport pandas as pd\nimport pytest\nfrom lamindb.errors import FieldValidationError\nfrom scipy.sparse import csc_matrix, csr_matrix\n\n\n@pytest.fixture(scope=\"module\")\ndef df():\n    return pd.DataFrame({\"feat1\": [1, 2], \"feat2\": [3, 4]})\n\n\n@pytest.fixture(scope=\"module\")\ndef adata():\n    return ad.AnnData(\n        X=np.array([[1, 2, 3], [4, 5, 6]]),\n        obs={\"feat1\": [\"A\", \"B\"]},\n        var=pd.DataFrame(index=[\"MYC\", \"TCF7\", \"GATA1\"]),\n        obsm={\"X_pca\": np.array([[1, 2], [3, 4]])},\n        raw={\"X\": np.array([[8, 9, 10, 11], [12, 13, 14, 15]])},\n    )\n\n\n@pytest.fixture(scope=\"module\")\ndef adata2():\n    return ad.AnnData(\n        X=np.array([[1, 2, 5], [4, 5, 8]]),\n        obs={\"feat1\": [\"A\", \"B\"]},\n        var=pd.DataFrame(index=[\"MYC\", \"TCF7\", \"GATA1\"]),\n        obsm={\"X_pca\": np.array([[1, 2], [3, 4]])},\n    )\n\n\ndef test_from_single_artifact(adata):\n    features = ln.Feature.from_dataframe(adata.obs)\n    validated = ln.Feature.validate(\n        [feature.name for feature in features], field=\"name\"\n    )\n    ln.save([feature for (feature, valid) in zip(features, validated) if valid])\n    artifact = ln.Artifact.from_anndata(adata, description=\"My adata\")\n    if not artifact._state.adding:\n        artifact.delete(permanent=True)  # make sure we get a fresh one\n        artifact = ln.Artifact.from_anndata(adata, description=\"My adata\")\n    with pytest.raises(ValueError) as error:\n        ln.Collection(artifact, key=\"Test\")\n    assert str(error.exconly()).startswith(\n        \"ValueError: Not all artifacts are yet saved, please save them\"\n    )\n    artifact.save()\n    with pytest.raises(ValueError) as error:\n        ln.Collection(artifact, artifact)\n    assert str(error.exconly()).startswith(\n        \"ValueError: Only one non-keyword arg allowed: artifacts\"\n    )\n    
transform = ln.Transform(key=\"My test transform\").save()\n    run = ln.Run(transform).save()\n    collection = ln.Collection(artifact, key=\"My new collection\", run=run).save()\n    assert collection.run.input_artifacts.get() == artifact\n    collection.delete(permanent=True)\n    artifact.delete(permanent=True)\n    assert ln.Artifact.filter(id=artifact.id).one_or_none() is None\n\n\ndef test_edge_cases(df, ccaplog):\n    with pytest.raises(\n        FieldValidationError,\n        match=re.escape(\n            \"Only artifacts, key, description, meta, reference, reference_type, run, revises, skip_hash_lookup can be passed\"\n        ),\n    ) as error:\n        ln.Collection(df, invalid_param=1)\n\n    with pytest.raises(ValueError) as error:\n        ln.Collection(1, key=\"Invalid\")\n    assert str(error.exconly()).startswith(\n        \"ValueError: Artifact or list[Artifact] is allowed.\"\n    )\n\n    artifact = ln.Artifact.from_dataframe(df, description=\"Test artifact\")\n    assert artifact._state.adding\n    with pytest.raises(ValueError) as error:\n        ln.Collection([artifact])\n    assert str(error.exconly()).startswith(\n        \"ValueError: Not all artifacts are yet saved, please save them\"\n    )\n    artifact.save()\n    ln.Collection([artifact, artifact], key=\"test-collection\")\n    assert \"your collection contains artifacts with non-unique hashes:\" in ccaplog.text\n    artifact.delete(permanent=True)\n\n\ndef test_from_inconsistent_artifacts(df, adata):\n    artifact1 = ln.Artifact.from_dataframe(df, description=\"My test\").save()\n    artifact2 = ln.Artifact.from_anndata(adata, description=\"My test2\").save()\n    collection = ln.Collection([artifact1, artifact2], key=\"Inconsistent\").save()\n    # test idempotency of .save()\n    collection.save()\n    # create a run context\n    ln.track(transform=ln.Transform(key=\"My test transform\"))\n    # can iterate over them\n    collection.cache()\n    assert 
set(ln.context.run.input_collections.all()) == {collection}\n    # loading will throw an error here\n    with pytest.raises(ValueError) as error:\n        collection.load()\n    assert str(error.exconly()).startswith(\n        \"ValueError: Can only load collections where all artifacts have the same suffix\"\n    )\n    # test through query set\n    with pytest.raises(ValueError) as error:\n        collection.artifacts.all().load()\n    assert str(error.exconly()).startswith(\n        \"ValueError: Can only load collections where all artifacts have the same suffix\"\n    )\n    collection.describe()\n    collection.delete(permanent=True)\n    artifact1.delete(permanent=True)\n    artifact2.delete(permanent=True)\n    ln.context._run = None\n\n\ndef test_from_consistent_artifacts(adata, adata2):\n    artifact1 = ln.Artifact.from_anndata(adata, key=\"my_test.h5ad\").save()\n    artifact2 = ln.Artifact.from_anndata(adata2, key=\"my_test.h5ad\").save()\n    transform = ln.Transform(key=\"My test transform\").save()\n    run = ln.Run(transform).save()\n    initial_key = \"My test\"\n    collection = ln.Collection([artifact1, artifact2], key=initial_key, run=run)\n    assert collection._state.adding\n    collection.save()\n    assert set(collection.run.input_artifacts.all()) == {artifact1, artifact2}\n    adata_joined = collection.load()\n    assert \"artifact_uid\" in adata_joined.obs.columns\n    assert artifact1.uid in adata_joined.obs.artifact_uid.cat.categories\n    # test from query set through collection\n    adata_joined = collection.artifacts.order_by(\"-created_at\").load()\n    assert \"artifact_uid\" in adata_joined.obs.columns\n    assert artifact1.uid in adata_joined.obs.artifact_uid.cat.categories\n\n    # re-run with hash-based lookup\n    collection2 = ln.Collection([artifact1, artifact2], key=\"My test 1\", run=run)\n    assert collection2 == collection\n    assert collection2.key == \"My test 1\"  # key is updated\n\n    # skip hash lookup\n    
collection2 = ln.Collection(\n        [artifact1, artifact2], key=\"My test 1\", run=run, skip_hash_lookup=True\n    )\n    assert collection2 != collection\n\n    # let hash uniqueness constraint fail and database return the existing record\n    collection2 = ln.Collection(\n        [artifact1, artifact2], key=initial_key, run=run, skip_hash_lookup=True\n    ).save()\n    assert collection2 == collection\n\n    # move to trash and then re-run\n    collection.delete()\n    collection2 = ln.Collection([artifact1, artifact2], key=\"My test 2\", run=run)\n    assert collection2 != collection\n    assert collection2.key == \"My test 2\"\n\n    collection.delete(permanent=True)\n    artifact1.delete(permanent=True)\n    artifact2.delete(permanent=True)\n\n\ndef test_mapped(adata, adata2):\n    # prepare test data\n    adata.strings_to_categoricals()\n    adata.obs[\"feat2\"] = adata.obs[\"feat1\"]\n    adata.layers[\"layer1\"] = adata.X.copy()\n    adata.layers[\"layer1\"][0, 0] = 0\n    artifact1 = ln.Artifact.from_anndata(adata, key=\"part_one.h5ad\").save()\n    adata2.X = csr_matrix(adata2.X)\n    adata2.layers[\"layer1\"] = adata2.X.copy()\n    adata2.obs[\"feat2\"] = adata2.obs[\"feat1\"]\n    artifact2 = ln.Artifact.from_anndata(\n        adata2, key=\"part_two.zarr\", format=\"zarr\"\n    ).save()\n    adata3 = adata2.copy()\n    adata3.var_names = [\"A\", \"B\", \"C\"]\n    adata3.obs.loc[\"0\", \"feat1\"] = np.nan\n    artifact3 = ln.Artifact.from_anndata(adata3, key=\"other_vars.h5ad\").save()\n    adata4 = adata.copy()\n    adata4.layers[\"layer1\"] = csc_matrix(adata4.layers[\"layer1\"])\n    artifact4 = ln.Artifact.from_anndata(adata4, description=\"csc layer\").save()\n    collection_outer = ln.Collection(\n        [artifact1, artifact2, artifact3], key=\"gather_outer\"\n    ).save()\n    collection_csc = ln.Collection([artifact4, artifact2], key=\"check_csc\").save()\n    collection = ln.Collection([artifact1, artifact2], key=\"gather\")\n    # test 
mapped without saving first\n    with collection.mapped() as ls_ds:\n        assert ls_ds.__class__.__name__ == \"MappedCollection\"\n    collection.save()\n\n    # test encoders\n    with pytest.raises(ValueError):\n        ls_ds = collection.mapped(encode_labels=[\"feat1\"])\n    with pytest.raises(ValueError):\n        ls_ds = collection.mapped(obs_keys=\"feat1\", encode_labels=[\"feat3\"])\n    with pytest.raises(ValueError):\n        ls_ds = collection.mapped(obs_keys=\"feat1\", unknown_label={\"feat3\": \"Unknown\"})\n    with collection.mapped(obs_keys=[\"feat1\", \"feat2\"], unknown_label=\"A\") as ls_ds:\n        assert ls_ds.encoders[\"feat1\"][\"A\"] == -1\n        assert ls_ds.encoders[\"feat1\"][\"B\"] == 0\n        assert ls_ds.encoders[\"feat2\"][\"A\"] == -1\n        assert ls_ds.encoders[\"feat2\"][\"B\"] == 0\n        assert ls_ds[0][\"feat1\"] == -1\n        assert ls_ds[1][\"feat1\"] == 0\n        assert ls_ds[0][\"feat2\"] == -1\n        assert ls_ds[1][\"feat2\"] == 0\n    with collection.mapped(\n        obs_keys=[\"feat1\", \"feat2\"], unknown_label={\"feat1\": \"A\"}\n    ) as ls_ds:\n        assert ls_ds.encoders[\"feat1\"][\"A\"] == -1\n        assert ls_ds.encoders[\"feat1\"][\"B\"] == 0\n        # categories in the encoder are sorted\n        A_enc = ls_ds.encoders[\"feat2\"][\"A\"]\n        assert A_enc == 0\n        B_enc = ls_ds.encoders[\"feat2\"][\"B\"]\n        assert B_enc == 1\n        assert ls_ds[0][\"feat1\"] == -1\n        assert ls_ds[1][\"feat1\"] == 0\n        assert ls_ds[0][\"feat2\"] == A_enc\n        assert ls_ds[1][\"feat2\"] == B_enc\n    with collection.mapped(\n        obs_keys=[\"feat1\", \"feat2\"], unknown_label=\"A\", encode_labels=[\"feat1\"]\n    ) as ls_ds:\n        assert ls_ds.encoders[\"feat1\"][\"A\"] == -1\n        assert ls_ds.encoders[\"feat1\"][\"B\"] == 0\n        assert \"feat2\" not in ls_ds.encoders\n        assert ls_ds[0][\"feat1\"] == -1\n        assert ls_ds[1][\"feat1\"] == 0\n        
assert ls_ds[0][\"feat2\"] == \"A\"\n        assert ls_ds[1][\"feat2\"] == \"B\"\n\n    ls_ds = collection.mapped(obs_keys=\"feat1\")\n    assert not ls_ds.closed\n\n    assert len(ls_ds) == 4\n    assert len(ls_ds[0]) == 3 and len(ls_ds[2]) == 3\n    assert len(ls_ds[0][\"X\"]) == 3\n    assert np.array_equal(ls_ds[2][\"X\"], np.array([1, 2, 5]))\n    weights = ls_ds.get_label_weights(\"feat1\")\n    assert len(weights) == 4\n    assert all(weights == 0.5)\n    weights = ls_ds.get_label_weights([\"feat1\", \"feat2\"])\n    assert len(weights) == 4\n    assert all(weights == 0.5)\n    weights = ls_ds.get_label_weights([\"feat1\", \"feat2\"], scaler=1.0)\n    assert all(weights == 1.0 / 3.0)\n    weights = ls_ds.get_label_weights(\n        [\"feat1\", \"feat2\"], scaler=1.0, return_categories=True\n    )\n    assert weights[\"A__A\"] == 1.0 / 3.0\n    assert weights[\"B__B\"] == 1.0 / 3.0\n\n    assert not ls_ds.check_vars_sorted(ascending=True)\n    assert not ls_ds.check_vars_sorted(ascending=False)\n    assert ls_ds.check_vars_non_aligned([\"MYC\", \"TCF7\", \"GATA1\"]) == []\n    ls_ds.var_list = None\n    assert not ls_ds.check_vars_sorted()\n    ls_ds.var_list = None\n    assert ls_ds.check_vars_non_aligned([\"MYC\", \"TCF7\", \"GATA1\"]) == []\n\n    ls_ds.close()\n    assert ls_ds.closed\n    del ls_ds\n\n    with collection.mapped(obs_keys=\"feat1\", join=\"inner\", dtype=\"float32\") as ls_ds:\n        assert not ls_ds.closed\n        assert len(ls_ds) == 4\n        assert len(ls_ds[0]) == 3 and len(ls_ds[2]) == 3\n        assert str(ls_ds[0][\"X\"].dtype) == \"float32\"\n        assert str(ls_ds[2][\"X\"].dtype) == \"float32\"\n    assert ls_ds.closed\n\n    ls_ds = collection.mapped(obs_keys=\"feat1\", parallel=True)\n    assert len(ls_ds[0]) == 3 and len(ls_ds[2]) == 3\n    assert ls_ds[0][\"_store_idx\"] == 0\n    assert ls_ds[2][\"_store_idx\"] == 1\n\n    ls_ds = collection.mapped(\n        layers_keys=[\"layer1\"], obsm_keys=[\"X_pca\"], 
obs_keys=\"feat1\"\n    )\n    assert np.array_equal(ls_ds[0][\"layer1\"], np.array([0, 2, 3]))\n    assert np.array_equal(ls_ds[2][\"layer1\"], np.array([1, 2, 5]))\n    assert np.array_equal(ls_ds[2][\"obsm_X_pca\"], np.array([1, 2]))\n    assert np.array_equal(ls_ds[3][\"obsm_X_pca\"], np.array([3, 4]))\n    assert ls_ds.shape == (4, 3)\n    assert ls_ds.original_shapes[0] == (2, 3) and ls_ds.original_shapes[1] == (2, 3)\n    ls_ds.close()\n    # keys not present in a store are ignored (omitted from output)\n    with collection.mapped(\n        obs_keys=[\"feat1\", \"feat_missing\"],\n        obsm_keys=[\"X_pca\", \"X_missing\"],\n        layers_keys=[\"X\", \"raw.X\"],\n    ) as ls_ds:\n        assert len(ls_ds) == 4\n        ls_ds_idx = ls_ds[0]\n        assert ls_ds_idx[\"X\"].shape == (3,)\n        assert ls_ds_idx[\"raw.X\"].shape == (4,)\n        assert \"feat1\" in ls_ds_idx\n        assert \"feat_missing\" not in ls_ds_idx\n        assert \"obsm_X_pca\" in ls_ds_idx\n        assert \"obsm_X_missing\" not in ls_ds_idx\n        assert \"raw.X\" not in ls_ds[2]\n    # test with QuerySet\n    query_set = ln.Artifact.filter(key__in=[\"part_one.h5ad\", \"part_two.zarr\"])\n    with query_set.mapped() as ls_ds:\n        assert ls_ds.shape == (4, 3)\n    with query_set.order_by(\"created_at\").mapped(stream=True) as ls_ds:\n        assert ls_ds.shape == (4, 3)\n\n    with collection.mapped(obs_keys=\"feat1\", stream=True) as ls_ds:\n        assert len(ls_ds[0]) == 3 and len(ls_ds[2]) == 3\n\n    with pytest.raises(ValueError):\n        with collection_outer.mapped(obs_keys=\"feat1\", join=\"inner\"):\n            pass\n\n    with collection_outer.mapped(\n        layers_keys=\"X\", obsm_keys=\"X_pca\", obs_keys=\"feat1\", join=\"outer\"\n    ) as ls_ds:\n        assert ls_ds.shape == (6, 6)\n        assert ls_ds.join_vars == \"outer\"\n        assert len(ls_ds.var_joint) == 6\n        assert len(ls_ds[0]) == 4\n        assert len(ls_ds[0][\"X\"]) == 6\n        
assert np.array_equal(ls_ds[0][\"X\"], np.array([0, 0, 0, 3, 1, 2]))\n        assert np.array_equal(ls_ds[1][\"X\"], np.array([0, 0, 0, 6, 4, 5]))\n        assert np.array_equal(ls_ds[2][\"X\"], np.array([0, 0, 0, 5, 1, 2]))\n        assert np.array_equal(ls_ds[3][\"X\"], np.array([0, 0, 0, 8, 4, 5]))\n        ls_ds_idx = ls_ds[4]\n        assert np.array_equal(ls_ds_idx[\"X\"], np.array([1, 2, 5, 0, 0, 0]))\n        assert ls_ds_idx[\"feat1\"] is np.nan\n        assert np.array_equal(ls_ds[5][\"X\"], np.array([4, 5, 8, 0, 0, 0]))\n        assert np.issubdtype(ls_ds[2][\"X\"].dtype, np.integer)\n        assert np.issubdtype(ls_ds[4][\"X\"].dtype, np.integer)\n        assert np.array_equal(ls_ds[3][\"obsm_X_pca\"], np.array([3, 4]))\n        assert ls_ds.check_vars_non_aligned([\"MYC\", \"TCF7\", \"GATA1\"]) == [2]\n        assert not ls_ds.check_vars_sorted()\n        assert len(ls_ds.get_label_weights(\"feat1\")) == 6\n\n    with collection_outer.mapped(layers_keys=\"layer1\", join=\"outer\") as ls_ds:\n        assert np.array_equal(ls_ds[0][\"layer1\"], np.array([0, 0, 0, 3, 0, 2]))\n        assert np.array_equal(ls_ds[4][\"layer1\"], np.array([1, 2, 5, 0, 0, 0]))\n\n    # csc matrix in layers\n    with pytest.raises(ValueError):\n        collection_csc.mapped(layers_keys=\"layer1\")\n\n    # test with obs_filter\n    # tuple as obs_filter is deprecated, test anyways for now\n    with collection.mapped(obs_filter=(\"feat1\", (\"A\", \"B\"))) as ls_ds:\n        assert ls_ds.shape == (4, 3)\n        assert np.array_equal(ls_ds[1][\"X\"], np.array([4, 5, 6]))\n        assert np.array_equal(ls_ds[3][\"X\"], np.array([4, 5, 8]))\n        weights = ls_ds.get_label_weights(\"feat1\")\n        assert len(weights) == 4\n        assert all(weights == 0.5)\n    # tuple as obs_filter is deprecated, test anyways for now\n    with collection.mapped(obs_filter=(\"feat1\", \"B\")) as ls_ds:\n        assert ls_ds.shape == (2, 3)\n        assert np.array_equal(ls_ds[0][\"X\"], 
np.array([4, 5, 6]))\n        assert np.array_equal(ls_ds[1][\"X\"], np.array([4, 5, 8]))\n        weights = ls_ds.get_label_weights(\"feat2\")\n        assert len(weights) == 2\n        assert all(weights == 0.5)\n\n    with collection.mapped(obs_filter={\"feat1\": \"B\", \"feat2\": (\"A\", \"B\")}) as ls_ds:\n        assert ls_ds.shape == (2, 3)\n        assert ls_ds.original_shapes == [(1, 3), (1, 3)]\n        assert np.array_equal(ls_ds[0][\"X\"], np.array([4, 5, 6]))\n        assert np.array_equal(ls_ds[1][\"X\"], np.array([4, 5, 8]))\n        weights = ls_ds.get_label_weights(\"feat2\")\n        assert len(weights) == 2\n        assert all(weights == 0.5)\n    # nan in filtering values\n    with collection_outer.mapped(obs_filter={\"feat1\": np.nan}, join=\"outer\") as ls_ds:\n        assert ls_ds.shape == (1, 6)\n        assert np.array_equal(ls_ds[0][\"X\"], np.array([1, 2, 5, 0, 0, 0]))\n    with collection_outer.mapped(\n        obs_filter={\"feat1\": (np.nan,), \"feat2\": [\"A\", \"B\"]}, join=\"outer\"\n    ) as ls_ds:\n        assert ls_ds.shape == (1, 6)\n    with collection_outer.mapped(\n        obs_filter={\"feat1\": (np.nan, \"A\", \"B\")}, join=\"outer\"\n    ) as ls_ds:\n        assert ls_ds.shape == (6, 6)\n    with collection_outer.mapped(\n        obs_filter={\"feat1\": [\"A\", \"B\"]}, join=\"outer\"\n    ) as ls_ds:\n        assert ls_ds.shape == (5, 6)\n    with collection_outer.mapped(\n        obs_filter={\"feat1\": (\"A\", np.nan)}, join=\"outer\"\n    ) as ls_ds:\n        assert ls_ds.shape == (3, 6)\n\n    collection.delete(permanent=True)\n    collection_outer.delete(permanent=True)\n    collection_csc.delete(permanent=True)\n    artifact1.delete(permanent=True)\n    artifact2.delete(permanent=True)\n    artifact3.delete(permanent=True)\n    artifact4.delete(permanent=True)\n\n\ndef test_revise_collection(df, adata):\n    # create a versioned collection\n    artifact = ln.Artifact.from_dataframe(df, description=\"test\").save()\n    
collection = ln.Collection(artifact, key=\"test-collection\", version=\"1\")\n    assert collection.version_tag == \"1\"\n    assert collection.version == \"1\"\n    assert collection.uid.endswith(\"0000\")\n    collection.save()\n\n    artifact = ln.Artifact.from_anndata(adata, description=\"test\").save()\n\n    with pytest.raises(ValueError) as error:\n        collection_r2 = ln.Collection(artifact, revises=collection, version=\"1\")\n    assert (\n        error.exconly()\n        == \"ValueError: Please change the version tag or leave it `None`, '1' is already taken\"\n    )\n\n    with pytest.raises(TypeError):\n        ln.Collection(adata, revises=\"wrong-type\")\n\n    # create new collection from old collection\n    collection_r2 = ln.Collection(artifact, key=\"test-collection\")\n    assert collection_r2.stem_uid == collection.stem_uid\n    assert collection_r2.uid.endswith(\"0001\")\n    # repeat\n    collection_r2 = ln.Collection(artifact, key=\"test-collection\")\n    assert collection_r2.stem_uid == collection.stem_uid\n    assert collection_r2.uid.endswith(\"0001\")\n    assert collection_r2.version_tag is None\n    assert (\n        collection_r2.version == collection_r2.uid[-4:]\n    )  # version falls back to uid suffix\n    assert collection_r2.key == \"test-collection\"\n\n    collection_r2.save()\n\n    # create new collection from newly versioned collection\n    df.iloc[0, 0] = 0\n    artifact = ln.Artifact.from_dataframe(df, description=\"test\")\n    artifact.save()\n    collection_r3 = ln.Collection(\n        artifact,\n        key=\"test-collection\",\n        description=\"test description3\",\n        version=\"2\",\n    )\n    assert collection_r3.stem_uid == collection.stem_uid\n    assert collection_r3.version_tag == \"2\"\n    assert collection_r3.version == \"2\"\n    assert collection_r3.uid.endswith(\"0002\")\n    assert collection_r3.key == \"test-collection\"\n    assert collection_r3.description == \"test description3\"\n\n    
artifacts_r2 = collection_r2.artifacts.all()\n    collection_r2.delete(permanent=True)\n    artifacts_r2.delete(permanent=True)\n    artifacts = collection.artifacts.all()\n    collection.delete(permanent=True)\n    artifacts.delete(permanent=True)\n\n\ndef test_collection_append(df, adata):\n    artifact = ln.Artifact.from_dataframe(df, description=\"test\").save()\n    artifact_1 = ln.Artifact.from_anndata(adata, description=\"test\").save()\n    collection = ln.Collection(artifact, key=\"Test\", description=\"Test append\").save()\n    new_collection = collection.append(artifact_1).save()\n\n    assert new_collection.key == collection.key\n    assert new_collection.description == collection.description\n    assert new_collection.uid.endswith(\"0001\")\n    artifacts = new_collection.artifacts.all()\n    assert len(artifacts) == 2\n\n    new_collection.versions.delete(permanent=True)\n    artifacts.delete(permanent=True)\n\n\ndef test_with_metadata(df, adata):\n    meta_artifact = ln.Artifact.from_dataframe(df, description=\"test\")\n    meta_artifact.save()\n    data_artifact = ln.Artifact.from_anndata(adata, description=\"test adata\")\n    data_artifact.save()\n    collection = ln.Collection(\n        data_artifact, key=\"test collection\", meta_artifact=meta_artifact\n    )\n    collection.save()\n\n    assert collection.meta_artifact == meta_artifact\n    assert collection.data_artifact == data_artifact\n    collection.delete(permanent=True)\n    data_artifact.delete(permanent=True)\n    meta_artifact.delete(permanent=True)\n\n\ndef test_collection_get_tracking(df):\n    artifact = ln.Artifact.from_dataframe(df, key=\"df.parquet\").save()\n    collection = ln.Collection(artifact, key=\"track-collection\").save()\n\n    transform = ln.Transform(key=\"test track collection via get\").save()\n    run = ln.Run(transform).save()\n\n    assert (\n        ln.Collection.get(key=\"track-collection\", is_run_input=run)\n        in run.input_collections.all()\n    
)\n\n    collection.delete(permanent=True)\n    artifact.delete(permanent=True)\n    transform.delete(permanent=True)\n\n\ndef test_describe_collection(adata, capsys):\n    artifact = ln.Artifact(adata, description=\"test\").save()\n    collection = ln.Collection(artifact, key=\"test\").save()\n    collection.describe()\n    captured = capsys.readouterr()\n    assert len(captured.out) > 50\n    assert \"collection\" in captured.out.lower()\n\n    # test describing from a remote postgres instance with less modules\n    collection = ln.Collection.connect(\"laminlabs/lamin-dev\").first()\n    collection.describe()\n    captured = capsys.readouterr()\n    assert len(captured.out) > 50\n    assert \"collection\" in captured.out.lower()\n"
  },
  {
    "path": "tests/core/test_curator_basics.py",
    "content": "import re\nimport textwrap\n\nimport bionty as bt\nimport lamindb as ln\nimport pandas as pd\nimport pytest\nfrom lamindb.core.exceptions import ValidationError\n\n\ndef _strip_ansi(text: str) -> str:\n    \"\"\"Remove ANSI escape sequences from a string.\"\"\"\n    ansi_escape = re.compile(r\"\\x1B(?:[@-Z\\\\-_]|\\[[0-?]*[ -/]*[@-~])\")\n    return ansi_escape.sub(\"\", text)\n\n\n@pytest.fixture\ndef df() -> pd.DataFrame:\n    return pd.DataFrame(\n        {\n            \"sample_id\": [\"sample1\", \"sample2\"],\n            \"sample_name\": [\"Sample 1\", \"Sample 2\"],\n            \"sample_type\": [\"Type A\", \"Type B\"],\n        }\n    )\n\n\n@pytest.fixture\ndef df_missing_sample_type_column() -> pd.DataFrame:\n    return pd.DataFrame(\n        {\n            \"sample_id\": [\"sample1\", \"sample2\"],\n            \"sample_name\": [\"Sample 1\", \"Sample 2\"],\n        }\n    )\n\n\n@pytest.fixture\ndef df_missing_sample_name_column() -> pd.DataFrame:\n    return pd.DataFrame(\n        {\n            \"sample_id\": [\"sample1\", \"sample2\"],\n            \"sample_type\": [\"Type A\", \"Type B\"],\n        }\n    )\n\n\n@pytest.fixture\ndef df_changed_col_order() -> pd.DataFrame:\n    return pd.DataFrame(\n        {\n            \"sample_name\": [\"Sample 1\", \"Sample 2\"],\n            \"sample_type\": [\"Type A\", \"Type B\"],\n            \"sample_id\": [\"sample1\", \"sample2\"],\n        }\n    )\n\n\n@pytest.fixture\ndef df_extra_column() -> pd.DataFrame:\n    return pd.DataFrame(\n        {\n            \"sample_id\": [\"sample1\", \"sample2\"],\n            \"sample_name\": [\"Sample 1\", \"Sample 2\"],\n            \"sample_type\": [\"Type A\", \"Type B\"],\n            \"extra_column\": [\"Extra 1\", \"Extra 2\"],\n        }\n    )\n\n\n@pytest.fixture\ndef df_disease() -> pd.DataFrame:\n    return pd.DataFrame(\n        {\n            \"disease\": pd.Categorical(\n                [\n                    # Only after 2025 mondo\n 
                   \"HDAC4-related haploinsufficiency syndrome\",\n                    \"SAMD9L-related spectrum and myeloid neoplasm risk\",\n                    # Already before 2025 mondo\n                    \"essential hypertension\",\n                    \"essential hypertension\",\n                    \"asthma\",\n                ]\n            ),\n        }\n    )\n\n\n@pytest.fixture\ndef disease_ontology_old() -> bt.Source:\n    return bt.Disease.add_source(\n        bt.Source.connect(\"laminlabs/bionty-assets\")\n        .get(entity=\"bionty.Disease\", version=\"2024-08-06\", organism=\"all\")\n        .save()\n    )\n\n\n@pytest.fixture(scope=\"module\")\ndef lists_df():\n    return pd.DataFrame(\n        {\n            \"sample_id\": [[\"sample1\", \"sample2\"], [\"sample2\"], [\"sample3\"]],\n            \"dose\": [[1.2, 2.3], [1.2], [2.3]],\n            \"cell_type\": [[\"B cell\", \"T cell\"], [\"B cell\"], [\"T cell\"]],\n            \"tissue\": [[\"blood\", \"pulmo\"], [\"blood\"], [\"lung\"]],\n        }\n    )\n\n\n@pytest.fixture(scope=\"module\")\ndef cat_df():\n    return pd.DataFrame(\n        {\n            \"sample_id\": [[\"sample1\", \"sample2\"], [\"sample2\"], [\"sample3\"]],\n            \"dose\": [[1.2, 2.3], [1.2], [2.3]],\n            \"cell_type\": [[\"B cell\", \"T cell\"], [\"B cell\"], [\"T cell\"]],\n            \"tissue\": [\"blood\", \"blood\", \"lung\"],\n        }\n    )\n\n\ndef test_curator_df_multivalue(lists_df, cat_df):\n    feature1 = ln.Feature(name=\"sample_id\", dtype=list[str]).save()\n    feature2 = ln.Feature(name=\"dose\", dtype=list[float]).save()\n    feature3 = ln.Feature(name=\"cell_type\", dtype=list[str]).save()\n    feature4 = ln.Feature(name=\"tissue\", dtype=list[bt.Tissue]).save()\n    schema = ln.Schema(\n        name=\"lists schema cat\",\n        features=[\n            feature1,\n            feature2,\n            feature3,\n            feature4,\n        ],\n    ).save()\n\n    curator = 
ln.curators.DataFrameCurator(lists_df, schema)\n    with pytest.raises(ValidationError):\n        curator.validate()\n    assert curator.cat._cat_vectors.keys() == {\"columns\", \"tissue\"}\n    assert curator.cat._cat_vectors[\"tissue\"]._validated == [\"blood\", \"lung\"]\n    assert curator.cat._cat_vectors[\"tissue\"]._non_validated == [\"pulmo\"]\n    assert curator.cat._cat_vectors[\"tissue\"]._synonyms == {\"pulmo\": \"lung\"}\n\n    curator.cat.standardize(\"tissue\")\n    assert curator.cat._cat_vectors[\"tissue\"]._non_validated == []\n    assert lists_df[\"tissue\"].tolist() == [[\"blood\", \"lung\"], [\"blood\"], [\"lung\"]]\n\n    assert curator.validate() is None\n\n    # test with cat_df which has a non-list tissue\n    curator = ln.curators.DataFrameCurator(cat_df, schema)\n    with pytest.raises(ValidationError):\n        curator.validate()\n\n    schema.delete(permanent=True)\n    feature1.delete(permanent=True)\n    feature2.delete(permanent=True)\n    feature3.delete(permanent=True)\n    feature4.delete(permanent=True)\n\n\ndef test_curators_list_feature_nullable_empty_list():\n    \"\"\"Test that a list feature that is nullable can accept empty lists.\"\"\"\n    feature_list = ln.Feature(\n        name=\"list_tissue\", dtype=list[bt.Tissue.ontology_id], nullable=True\n    ).save()\n    feature_int = ln.Feature(name=\"feature int\", dtype=int, nullable=True).save()\n    schema = ln.Schema(\n        name=\"test_list_feature_schema\",\n        features=[feature_list, feature_int],\n        coerce=True,\n    ).save()\n\n    df = pd.DataFrame({\"list_tissue\": [], \"feature int\": []})\n    ln.curators.DataFrameCurator(df, schema).validate()\n\n    # clean up\n    schema.delete(permanent=True)\n    feature_list.delete(permanent=True)\n    feature_int.delete(permanent=True)\n\n\ndef test_curator__repr__(df):\n    feature = ln.Feature(name=\"sample_id\", dtype=\"str\").save()\n    schema = ln.Schema(\n        name=\"sample schema\",\n        
features=[feature],\n    ).save()\n    curator = ln.curators.DataFrameCurator(df, schema)\n\n    expected_repr = textwrap.dedent(\"\"\"\\\n    DataFrameCurator(Schema: sample schema, unvalidated)\n    \"\"\").strip()\n\n    actual_repr = _strip_ansi(repr(curator))\n    print(actual_repr)\n    assert actual_repr.strip() == expected_repr.strip()\n\n    schema.delete(permanent=True)\n    feature.delete(permanent=True)\n\n\n@pytest.mark.parametrize(\n    \"model_class\",\n    [ln.ULabel, ln.Record],\n)\ndef test_df_curator_typed_categorical(model_class):\n    # root level\n    sample_root_type = model_class(name=\"Sample\", is_type=True).save()\n    for name in [\"s1\", \"s2\"]:\n        model_class(name=name, type=sample_root_type).save()\n\n    # lab A level\n    lab_a_type = model_class(name=\"LabA\", is_type=True).save()\n    sample_a_type = model_class(name=\"Sample\", is_type=True, type=lab_a_type).save()\n    for name in [\"s3\", \"s4\"]:\n        model_class(name=name, type=sample_a_type).save()\n\n    # lab B level\n    lab_b_type = model_class(name=\"LabB\", is_type=True).save()\n    sample_b_type = model_class(name=\"Sample\", is_type=True, type=lab_b_type).save()\n    for name in [\"s5\", \"s6\"]:\n        model_class(name=name, type=sample_b_type).save()\n\n    df = pd.DataFrame(\n        {\n            \"biosample_name\": pd.Categorical([\"s1\", \"s2\", \"s3\", \"s4\", \"s5\", \"s6\"]),\n        }\n    )\n\n    feature = ln.Feature(name=\"biosample_name\", dtype=sample_a_type).save()\n    curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features())\n    with pytest.raises(ln.errors.ValidationError) as error:\n        curator.validate()\n    assert \"4 terms not validated in feature 'biosample_name':\" in error.exconly()\n    assert set(curator.cat._cat_vectors[\"biosample_name\"]._validated) == {\n        \"s3\",\n        \"s4\",\n    }\n    assert set(curator.cat._cat_vectors[\"biosample_name\"]._non_validated) == {\n        \"s1\",\n 
       \"s2\",\n        \"s5\",\n        \"s6\",\n    }\n\n    # Move LabB under LabA\n    lab_b_type.type = lab_a_type\n    lab_b_type.save()\n    feature.delete(permanent=True)  # re-create the feature with the new dtype\n    feature = ln.Feature(name=\"biosample_name\", dtype=lab_a_type).save()\n    curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features())\n    with pytest.raises(ln.errors.ValidationError) as error:\n        curator.validate()\n    assert set(curator.cat._cat_vectors[\"biosample_name\"]._validated) == {\n        \"s3\",\n        \"s4\",\n        \"s5\",\n        \"s6\",\n    }\n    assert set(curator.cat._cat_vectors[\"biosample_name\"]._non_validated) == {\n        \"s1\",\n        \"s2\",\n    }\n\n    # Lab at the root\n    feature.delete(permanent=True)  # re-create the feature with the new dtype\n    feature = ln.Feature(name=\"biosample_name\", dtype=sample_root_type).save()\n    curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features())\n    with pytest.raises(ln.errors.ValidationError) as error:\n        curator.validate()\n    assert set(curator.cat._cat_vectors[\"biosample_name\"]._validated) == {\n        \"s1\",\n        \"s2\",\n    }\n    assert set(curator.cat._cat_vectors[\"biosample_name\"]._non_validated) == {\n        \"s3\",\n        \"s4\",\n        \"s5\",\n        \"s6\",\n    }\n\n    attribute = model_class.__name__.lower() + \"s\"\n    getattr(sample_a_type, attribute).all().delete(permanent=True)\n    getattr(sample_b_type, attribute).all().delete(permanent=True)\n    getattr(lab_b_type, attribute).all().delete(permanent=True)\n    getattr(lab_a_type, attribute).all().delete(permanent=True)\n    lab_a_type.delete(permanent=True)\n    lab_b_type.delete(permanent=True)\n    getattr(sample_root_type, attribute).all().delete(permanent=True)\n    sample_root_type.delete(permanent=True)\n    feature.delete(permanent=True)\n\n\ndef 
test_df_curator_same_name_at_different_levels_involving_root():\n    s1_root = ln.Record(name=\"s1\").save()\n    lab_a_type = ln.Record(name=\"LabA\", is_type=True).save()\n    s1_lab_a = ln.Record(name=\"s1\", type=lab_a_type).save()\n    df = pd.DataFrame({\"biosample_name\": pd.Categorical([\"s1\"])})\n\n    # feature constraining to lab_a_type\n    feature = ln.Feature(name=\"biosample_name\", dtype=lab_a_type).save()\n    curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features())\n    curator.validate()\n    cat_vector = curator._atomic_curator.cat._cat_vectors[\"biosample_name\"]\n    assert cat_vector._validated == [\"s1\"]\n    assert len(cat_vector.records) == 1\n    assert cat_vector.records[0] == s1_lab_a\n\n    # feature constraining to root\n    feature.delete(permanent=True)\n    feature = ln.Feature(name=\"biosample_name\", dtype=ln.Record).save()\n    curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features())\n    curator.validate()\n    cat_vector = curator._atomic_curator.cat._cat_vectors[\"biosample_name\"]\n    assert cat_vector._validated == [\"s1\"]\n    assert len(cat_vector.records) == 1\n    assert cat_vector.records[0] == s1_root\n\n    feature.delete(permanent=True)\n    s1_root.delete(permanent=True)\n    s1_lab_a.delete(permanent=True)\n    lab_a_type.delete(permanent=True)\n\n\ndef test_df_curator_same_name_at_different_levels_below_root():\n    department_a_type = ln.Record(name=\"DepartmentA\", is_type=True).save()\n    s1_department_a = ln.Record(name=\"s1\", type=department_a_type).save()\n    lab_a_type = ln.Record(name=\"LabA\", is_type=True, type=department_a_type).save()\n    s1_lab_a = ln.Record(name=\"s1\", type=lab_a_type).save()\n    df = pd.DataFrame({\"biosample_name\": pd.Categorical([\"s1\"])})\n\n    # feature constraining to lab_a_type\n    feature = ln.Feature(name=\"biosample_name\", dtype=lab_a_type).save()\n    curator = ln.curators.DataFrameCurator(df, 
ln.examples.schemas.valid_features())\n    curator.validate()\n    cat_vector = curator._atomic_curator.cat._cat_vectors[\"biosample_name\"]\n    assert cat_vector._validated == [\"s1\"]\n    assert len(cat_vector.records) == 1\n    assert cat_vector.records[0] == s1_lab_a\n\n    # feature constraining to department_a_type\n    feature.delete(permanent=True)\n    feature = ln.Feature(name=\"biosample_name\", dtype=department_a_type).save()\n    curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features())\n    curator.validate()\n    cat_vector = curator._atomic_curator.cat._cat_vectors[\"biosample_name\"]\n    assert cat_vector._validated == [\"s1\"]\n    assert len(cat_vector.records) == 1\n    assert cat_vector.records[0] == s1_department_a\n\n    feature.delete(permanent=True)\n    s1_department_a.delete(permanent=True)\n    s1_lab_a.delete(permanent=True)\n    lab_a_type.delete(permanent=True)\n    department_a_type.delete(permanent=True)\n\n\ndef test_df_curator_same_name_at_same_level():\n    # below root level\n    lab_a_type = ln.Record(name=\"LabA\", is_type=True).save()\n    record_1 = ln.Record(name=\"s1\", type=lab_a_type).save()\n    lab_b_type = ln.Record(name=\"LabB\", is_type=True).save()\n    record_2 = ln.Record(name=\"s1\", type=lab_b_type).save()\n    df = pd.DataFrame({\"biosample_name\": pd.Categorical([\"s1\"])})\n    feature = ln.Feature(name=\"biosample_name\", dtype=ln.Record).save()\n    curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features())\n    with pytest.raises(ln.errors.ValidationError) as error:\n        curator.validate()\n    assert (\n        \"Ambiguous match for Record 's1': found 2 records at depth 1 (under types: ['LabA', 'LabB'])\"\n        in error.exconly()\n    )\n\n    # at root level\n    record_1.type = None\n    record_1.save()\n    record_2.type = None\n    record_2.save()\n    curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features())\n    with 
pytest.raises(ln.errors.ValidationError) as error:\n        curator.validate()\n    assert (\n        \"Ambiguous match for Record 's1': found 2 root-level records\" in error.exconly()\n    )\n\n    feature.delete(permanent=True)\n    record_1.delete(permanent=True)\n    lab_a_type.delete(permanent=True)\n    record_2.delete(permanent=True)\n    lab_b_type.delete(permanent=True)\n\n\n# also see test_features_name_duplicates_across_equal_levels\ndef test_curator_schema_feature_mapping():\n    lab_a_type = ln.Feature(name=\"LabA\", is_type=True).save()\n    feature1 = ln.Feature(name=\"sample_name\", dtype=\"str\", type=lab_a_type).save()\n    lab_b_type = ln.Feature(name=\"LabB\", is_type=True).save()\n    feature2 = ln.Feature(name=\"sample_name\", dtype=\"str\", type=lab_b_type).save()\n    schema = ln.Schema([feature1], name=\"Lab A schema\").save()\n    df = pd.DataFrame({\"sample_name\": [\"s1\", \"s2\"]})\n    curator = ln.curators.DataFrameCurator(df, schema)\n    curator.validate()\n    cat_vector = curator._atomic_curator.cat._cat_vectors[\"columns\"]\n    assert len(cat_vector.records) == 1\n    assert len(cat_vector._validated) == 1\n    schema.delete(permanent=True)\n    feature1.delete(permanent=True)\n    feature2.delete(permanent=True)\n    lab_a_type.delete(permanent=True)\n    lab_b_type.delete(permanent=True)\n\n\ndef test_dtypes_at_different_levels(ccaplog):\n    sample_type_root = ln.Record(name=\"Sample\", is_type=True).save()\n    lab_a_type = ln.Record(name=\"LabA\", is_type=True).save()\n    sample_type_a = ln.Record(name=\"Sample\", is_type=True, type=lab_a_type).save()\n    s1_lab_a = ln.Record(name=\"s1\", type=sample_type_a).save()\n    df = pd.DataFrame({\"biosample_name\": pd.Categorical([\"s1\"])})\n    feature = ln.Feature(name=\"biosample_name\", dtype=sample_type_root).save()\n    schema = ln.Schema(features=[feature]).save()\n    sample_type_root.delete()\n    df = pd.DataFrame({\"biosample_name\": pd.Categorical([\"s1\"])})\n    # 
UID-based lookup can find records in trash, so curator creation should succeed\n    # but a warning should be printed\n    curator = ln.curators.DataFrameCurator(df, schema)\n    assert \"from trash\" in ccaplog.text\n    schema.delete(permanent=True)\n    sample_type_root.restore()\n    curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features())\n    with pytest.raises(ln.errors.ValidationError) as error:\n        curator.validate()\n    assert \"1 term not validated in feature 'biosample_name': 's1'\" in error.exconly()\n    s1_root = ln.Record(name=\"s1\", type=sample_type_root).save()\n    curator.validate()\n    cat_vector = curator._atomic_curator.cat._cat_vectors[\"biosample_name\"]\n    assert cat_vector._validated == [\"s1\"]\n    assert len(cat_vector.records) == 1\n    assert cat_vector.records[0] == s1_root\n    # update feature dtype\n    feature.delete(permanent=True)\n    feature = ln.Feature(name=\"biosample_name\", dtype=sample_type_a).save()\n    curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features())\n    curator.validate()\n    cat_vector = curator._atomic_curator.cat._cat_vectors[\"biosample_name\"]\n    assert cat_vector._validated == [\"s1\"]\n    assert len(cat_vector.records) == 1\n    assert cat_vector.records[0] == s1_lab_a\n    feature.delete(permanent=True)\n    s1_lab_a.delete(permanent=True)\n    sample_type_a.delete(permanent=True)\n    lab_a_type.delete(permanent=True)\n    s1_root.delete(permanent=True)\n    sample_type_root.delete(permanent=True)\n\n\ndef test_nullable():\n    disease = ln.Feature(name=\"disease\", dtype=ln.ULabel, nullable=False).save()\n    schema = ln.Schema(features=[disease]).save()\n    dataset = {\"disease\": pd.Categorical([pd.NA, \"asthma\"])}\n    df = pd.DataFrame(dataset)\n    curator = ln.curators.DataFrameCurator(df, schema)\n    with pytest.raises(ln.errors.ValidationError) as err:\n        curator.validate()\n    assert \"non-nullable series 'disease' 
contains null values\" in err.exconly()\n    # make feature nullable\n    # (needs to throw an error if already datasets were validated with it)\n    disease.nullable = True\n    disease.save()\n    curator = ln.curators.DataFrameCurator(df, schema)\n    with pytest.raises(\n        ValidationError,\n        # match=re.escape(\"1 term is not validated: 'asthma'\"),  # TODO: need the message\n    ):\n        curator.validate()\n\n    schema.delete(permanent=True)\n    disease.delete(permanent=True)\n\n\ndef test_pandera_dataframe_schema(\n    df,\n    df_missing_sample_type_column,\n    df_changed_col_order,\n    df_extra_column,\n    df_missing_sample_name_column,\n):\n    # schemas\n    schema_all_required = ln.Schema(\n        name=\"my-schema all required\",\n        features=[\n            ln.Feature(name=\"sample_id\", dtype=str).save(),\n            ln.Feature(name=\"sample_name\", dtype=str).save(),\n            ln.Feature(name=\"sample_type\", dtype=str).save(),\n        ],\n    ).save()\n    schema_maximal_set = ln.Schema(\n        name=\"my-schema maximal_set\",\n        features=[\n            ln.Feature(name=\"sample_id\", dtype=str).save(),\n            ln.Feature(name=\"sample_name\", dtype=str).save(),\n            ln.Feature(name=\"sample_type\", dtype=str).save(),\n        ],\n        minimal_set=False,\n        maximal_set=True,\n    ).save()\n    schema_ordered_set = ln.Schema(\n        name=\"my-schema ordered_set\",\n        features=[\n            ln.Feature(name=\"sample_id\", dtype=str).save(),\n            ln.Feature(name=\"sample_name\", dtype=str).save(),\n            ln.Feature(name=\"sample_type\", dtype=str).save(),\n        ],\n        ordered_set=True,\n    ).save()\n\n    # minimal_set=True, all three columns are required\n    ln.curators.DataFrameCurator(df, schema=schema_all_required).validate()\n    # can't miss a required column\n    with pytest.raises(ValidationError):\n        ln.curators.DataFrameCurator(\n            
df_missing_sample_type_column, schema=schema_all_required\n        ).validate()\n    # doesn't care about order\n    ln.curators.DataFrameCurator(\n        df_changed_col_order, schema=schema_all_required\n    ).validate()\n    # extra column is fine\n    ln.curators.DataFrameCurator(df_extra_column, schema=schema_all_required).validate()\n\n    # maximal_set=True, extra column is *not* allowed\n    # check that __lamindb values are OK\n    df[\"__lamindb_record_uid__\"] = \"some_value\"\n    ln.curators.DataFrameCurator(df, schema=schema_maximal_set).validate()\n    del df[\"__lamindb_record_uid__\"]\n    with pytest.raises(ValidationError):\n        ln.curators.DataFrameCurator(\n            df_extra_column,\n            schema=schema_maximal_set,  # extra column is not allowed\n        ).validate()\n    # minimal_set=False, missing column is allowed\n    ln.curators.DataFrameCurator(\n        df_missing_sample_type_column, schema=schema_maximal_set\n    ).validate()\n\n    # ordered_set=True, order matters\n    with pytest.raises(ValidationError):\n        ln.curators.DataFrameCurator(\n            df_changed_col_order, schema=schema_ordered_set\n        ).validate()\n\n    # a feature is optional\n    schema_optional_sample_name = ln.Schema(\n        name=\"my-schema optional sample_name\",\n        features=[\n            ln.Feature(name=\"sample_id\", dtype=str).save(),\n            ln.Feature(name=\"sample_name\", dtype=str).save().with_config(optional=True),\n            ln.Feature(name=\"sample_type\", dtype=str).save(),\n        ],\n    ).save()\n    # missing required \"sample_type\" column raises an error\n    with pytest.raises(ValidationError):\n        ln.curators.DataFrameCurator(\n            df_missing_sample_type_column,\n            schema=schema_optional_sample_name,\n        ).validate()\n    # missing optional column \"sample_name\" is fine\n    ln.curators.DataFrameCurator(\n        df_missing_sample_name_column, 
schema=schema_optional_sample_name\n    ).validate()\n\n    # clean up\n    ln.Schema.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n\n\ndef test_schema_not_saved(df):\n    \"\"\"Attempting to validate an unsaved Schema must error.\"\"\"\n    feature = ln.Feature(name=\"cell_type\", dtype=str).save()\n    schema = ln.Schema(features=[feature])\n\n    with pytest.raises(ValueError) as excinfo:\n        ln.curators.DataFrameCurator(df, schema)\n    assert excinfo.exconly() == (\n        \"ValueError: Schema must be saved before curation. Please save it using '.save()'.\"\n    )\n\n\ndef test_schema_artifact_annotated(df):\n    \"\"\"A passed Artifact should be annotated with a Schema if successfully curated.\"\"\"\n    af = ln.Artifact.from_dataframe(df, key=\"test.parquet\").save()\n    schema = ln.Schema(\n        name=\"sample schema\",\n        features=[ln.Feature(name=\"sample_id\", dtype=\"str\").save()],\n    ).save()\n    curator = ln.curators.DataFrameCurator(af, schema)\n    curator.validate()\n    curator.save_artifact()\n    af_queried = ln.Artifact.filter(key=\"test.parquet\").one()\n    assert af_queried.schema is not None\n\n    # clean up\n    af.delete(permanent=True)\n    ln.Schema.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n\n\ndef test_schema_optionals():\n    schema = ln.Schema(\n        name=\"my-schema\",\n        features=[\n            ln.Feature(name=\"sample_id\", dtype=str).save(),\n            ln.Feature(name=\"sample_name\", dtype=str).save().with_config(optional=True),\n            ln.Feature(name=\"sample_type\", dtype=str).save(),\n        ],\n    ).save()\n    assert schema.optionals.get().to_list(\"name\") == [\n        \"sample_name\",\n    ]\n\n    # set sample_type to optional\n    with pytest.raises(\n        TypeError,\n        match=re.escape(\"features must be a list of Feature records!\"),\n    ):\n        schema.optionals.set(\"test\")\n    
schema.optionals.set([ln.Feature.get(name=\"sample_type\")])\n    assert schema.optionals.get().to_list(\"name\") == [\"sample_type\"]\n    # add sample_name to optionals\n    with pytest.raises(\n        TypeError,\n        match=re.escape(\"features must be a list of Feature records!\"),\n    ):\n        schema.optionals.add(\"test\")\n    schema.optionals.add(ln.Feature.get(name=\"sample_name\"))\n    assert schema.optionals.get().to_list(\"name\") == [\"sample_name\", \"sample_type\"]\n\n    # clean up\n    ln.Schema.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n\n\ndef test_schema_ordered_set(df):\n    # create features with a different order so that sample_id is not the first\n    ln.Feature(name=\"sample_name\", dtype=str).save()\n    ln.Feature(name=\"sample_type\", dtype=str).save()\n    ln.Feature(name=\"sample_id\", dtype=str).save()\n\n    # create an ordered schema with sample_id as the first feature\n    schema = ln.Schema(\n        name=\"my-schema\",\n        features=[\n            ln.Feature(name=\"sample_id\", dtype=str).save(),\n            ln.Feature(name=\"sample_name\", dtype=str).save(),\n            ln.Feature(name=\"sample_type\", dtype=str).save(),\n        ],\n        ordered_set=True,\n    ).save()\n\n    assert ln.curators.DataFrameCurator(df, schema=schema).validate() is None\n\n    # clean up\n    ln.Schema.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n\n\n@pytest.mark.parametrize(\"minimal_set\", [True, False])\ndef test_schema_minimal_set_var_allowed(minimal_set):\n    \"\"\"Independent of the value of minimal_set, invalid ensembl gene IDs are allowed.\"\"\"\n    adata = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"AnnData\")\n    adata.var_names = [adata.var_names[0], adata.var_names[1], \"NOT_VALID_ENSEMBL\"]\n\n    var_schema = ln.Schema(\n        itype=bt.Gene.ensembl_gene_id,\n        minimal_set=minimal_set,\n    ).save()\n    schema = 
ln.Schema(otype=\"AnnData\", slots={\"var.T\": var_schema}).save()\n    curator = ln.curators.AnnDataCurator(adata, schema)\n    curator.validate()\n\n    # clean up\n    schema.delete(permanent=True)\n\n\ndef test_schema_maximal_set_var():\n    \"\"\"If maximal_set is True, invalid ensembl gene IDs are not allowed.\"\"\"\n    adata = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"AnnData\")\n    adata.var_names = [adata.var_names[0], adata.var_names[1], \"NOT_VALID_ENSEMBL\"]\n\n    var_schema = ln.Schema(itype=bt.Gene.ensembl_gene_id, maximal_set=True).save()\n    schema = ln.Schema(otype=\"AnnData\", slots={\"var.T\": var_schema}).save()\n\n    curator = ln.curators.AnnDataCurator(adata, schema)\n    with pytest.raises(ValidationError) as error:\n        curator.validate()\n    assert error.exconly() == (\n        \"lamindb.errors.ValidationError: 1 term not validated in feature 'columns' in slot 'var.T': 'NOT_VALID_ENSEMBL'\\n\"\n        \"    → fix typos, remove non-existent values, or save terms via: curator.slots['var.T'].cat.add_new_from('columns')\"\n    )\n\n    # clean up\n    schema.delete(permanent=True)\n\n\ndef test_feature_dtype_path():\n    df = pd.DataFrame(\n        {\n            \"sample\": [\"Sample_X\", \"Sample_Y\", \"Sample_Y\"],\n            \"fastq_1\": [\n                \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R1_001.fastq.gz\",\n                \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R1_001.fastq.gz\",\n                \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R1_001.fastq.gz\",\n            ],\n            \"fastq_2\": [\n                \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R2_001.fastq.gz\",\n                
\"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R2_001.fastq.gz\",\n                \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R2_001.fastq.gz\",\n            ],\n            \"expected_cells\": [5000, 5000, 5000],\n        }\n    )\n\n    nextflow_schema = ln.Schema(\n        name=\"nf-core/scrnaseq pipeline - params.input schema\",\n        description=\"https://github.com/nf-core/scrnaseq/blob/4.0.0/assets/schema_input.json\",\n        features=[\n            ln.Feature(\n                name=\"sample\",\n                dtype=\"str\",\n                nullable=False,\n                description=\"Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (_).\",\n            ).save(),\n            ln.Feature(\n                name=\"fastq_1\",\n                dtype=\"path\",\n                nullable=False,\n                description=\"Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension “.fastq.gz” or “.fq.gz”.\",\n            ).save(),\n            ln.Feature(\n                name=\"fastq_2\",\n                dtype=\"path\",\n                description=\"Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension “.fastq.gz” or “.fq.gz”.\",\n            ).save(),\n            ln.Feature(\n                name=\"expected_cells\",\n                dtype=int,\n                description=\"Number of cells expected for a sample. Must be an integer. If multiple rows are provided for the same sample, this must be the same number for all rows, i.e. 
the total number of expected cells for the sample.\",\n            ).save(),\n            ln.Feature(\n                name=\"seq_center\",\n                dtype=str,\n                description=\"Sequencing center for the sample. If multiple rows are provided for the same sample, this must be the same string for all rows. Samples sequenced at different centers are considered different samples and must have different identifiers.\",\n            ).save(),\n            ln.Feature(\n                name=\"sample_type\",\n                dtype=str,\n                description='\"atac\", \"gex\"',\n            ).save(),\n            ln.Feature(\n                name=\"feature_type\",\n                dtype=str,\n                description='\"gex\", \"vdj\", \"ab\", \"crispr\", \"cmo\"',\n            ).save(),\n        ],\n    ).save()\n\n    nextflow_schema.optionals.set(\n        [\n            ln.Feature.get(name=\"expected_cells\"),\n            ln.Feature.get(name=\"seq_center\"),\n            ln.Feature.get(name=\"sample_type\"),\n            ln.Feature.get(name=\"feature_type\"),\n        ]\n    )\n\n    curator = ln.curators.DataFrameCurator(df, schema=nextflow_schema)\n    assert curator.validate() is None\n\n    # clean up\n    nextflow_schema.delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n\n\ndef test_cat_filters_specific_source_uid(df_disease, disease_ontology_old):\n    \"\"\"Specific source_uid passed to the `cat_filters`\"\"\"\n    feature = ln.Feature(\n        name=\"disease\",\n        dtype=bt.Disease,\n        cat_filters={\"source__uid\": disease_ontology_old.uid},\n    ).save()\n    schema = ln.Schema([feature], name=\"test schema\").save()\n    curator = ln.curators.DataFrameCurator(df_disease, schema)\n    try:\n        curator.validate()\n    except ln.errors.ValidationError as error:\n        assert (\n            \"2 terms not validated in feature 'disease': 'HDAC4-related haploinsufficiency syndrome', 
'SAMD9L-related spectrum and myeloid neoplasm risk'\"\n            in str(error)\n        )\n    schema.delete(permanent=True)\n    feature.delete(permanent=True)\n\n\ndef test_cat_filters_specific_source(df_disease, disease_ontology_old):\n    \"\"\"Specific Source record passed to the `cat_filters`\"\"\"\n    feature = ln.Feature(\n        name=\"disease\",\n        dtype=bt.Disease,\n        cat_filters={\"source\": disease_ontology_old},\n    ).save()\n    schema = ln.Schema([feature], name=\"test schema\").save()\n    curator = ln.curators.DataFrameCurator(df_disease, schema)\n    try:\n        curator.validate()\n    except ln.errors.ValidationError as error:\n        assert (\n            \"2 terms not validated in feature 'disease': 'HDAC4-related haploinsufficiency syndrome', 'SAMD9L-related spectrum and myeloid neoplasm risk'\"\n            in str(error)\n        )\n\n    schema.delete(permanent=True)\n    feature.delete(permanent=True)\n\n\ndef test_cat_filters_multiple_relation_filters(df_disease, disease_ontology_old):\n    \"\"\"Multiple relation filters in cat_filters\"\"\"\n    # TODO: needs to also work if both filters are from the same related model!!!\n    feature = ln.Feature(\n        name=\"disease\",\n        dtype=bt.Disease,\n        cat_filters={\n            \"source__uid\": disease_ontology_old.uid,\n            \"created_by__handle\": ln.setup.settings.user.handle,\n        },\n    ).save()\n    schema = ln.Schema([feature], name=\"test schema\").save()\n    curator = ln.curators.DataFrameCurator(df_disease, schema)\n    try:\n        curator.validate()\n    except ln.errors.ValidationError as error:\n        assert (\n            \"2 terms not validated in feature 'disease': 'HDAC4-related haploinsufficiency syndrome', 'SAMD9L-related spectrum and myeloid neoplasm risk'\"\n            in str(error)\n        )\n    schema.delete(permanent=True)\n    feature.delete(permanent=True)\n\n\ndef test_curate_columns(df):\n    \"\"\"Test that 
columns can be curated.\"\"\"\n    schema = ln.Schema(\n        name=\"sample schema\",\n        features=[\n            ln.Feature(name=\"sample_id\", dtype=\"str\").save(),\n            ln.Feature(name=\"sample_name\", dtype=\"str\").save(),\n            ln.Feature(name=\"sample_type\", dtype=\"str\").save(),\n        ],\n    ).save()\n\n    # make one column name invalid\n    df.rename(columns={\"sample_name\": \"sample_name_name\"}, inplace=True)\n\n    curator = ln.curators.DataFrameCurator(df, schema)\n    try:\n        curator.validate()\n    except ln.errors.ValidationError as error:\n        assert \"column 'sample_name' not in dataframe\" in str(error)\n\n    # now fix the column\n    df.rename(columns={\"sample_name_name\": \"sample_name\"}, inplace=True)\n    curator.validate()\n\n    schema.delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n\n\ndef test_wrong_datatype(df):\n    feature = ln.Feature(name=\"sample_id\", dtype=ln.ULabel).save()\n    schema = ln.Schema(features=[feature]).save()\n\n    curator = ln.curators.DataFrameCurator(df, schema)\n    with pytest.raises(ln.errors.ValidationError) as excinfo:\n        curator.validate()\n\n    assert \"expected series 'sample_id' to have type category, got object\" in str(\n        excinfo.value\n    )\n    assert (\n        \"Hint: Consider setting `feature.coerce = True` to attempt coercing values during validation to the required dtype.\"\n        in str(excinfo.value)\n    )\n\n    schema.delete(permanent=True)\n    feature.delete(permanent=True)\n\n\ndef test_hash_index_feature(df):\n    df_index = df.set_index(\"sample_id\")\n    sample_name = ln.Feature(name=\"sample_name\", dtype=\"str\").save()\n    sample_name.uid = \"OpQAD5Ifu89t\"\n    sample_name.save()\n    sample_type = ln.Feature(name=\"sample_type\", dtype=\"str\").save()\n    sample_type.uid = \"7I4u69RiCAVy\"\n    sample_type.save()\n    sample_id = ln.Feature(name=\"sample_id\", dtype=\"str\").save()\n    
sample_id.uid = \"uValv1YfEQib\"\n    sample_id.save()\n    schema_index = ln.Schema(\n        name=\"sample schema with index\",\n        features=[\n            sample_name,\n            sample_type,\n        ],\n        index=sample_id,\n    ).save()\n    assert schema_index.hash == \"drtQMP4N4xEebS49DO-9Jw\"\n\n    schema = ln.Schema(\n        name=\"sample schema\",\n        features=[\n            sample_id,\n            sample_name,\n            sample_type,\n        ],\n    ).save()\n    assert schema.hash == \"Z_dmk1WendD15s2FyBW1HA\"\n\n    artifact = ln.Artifact.from_dataframe(\n        df_index, key=\"curated_df.parquet\", schema=schema_index\n    ).save()\n    assert artifact.schemas.all().one() == schema_index\n\n    # clean up\n    artifact.delete(permanent=True)\n    schema_index.delete(permanent=True)\n    schema.delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n\n\ndef test_add_new_from_subtype(df):\n    \"\"\"Test that add_new_from works with subtypes.\"\"\"\n    sample_type = ln.Record(name=\"SampleType\", is_type=True).save()\n    ln.Record(name=\"Type A\", type=sample_type).save()\n    schema = ln.Schema(\n        name=\"sample schema\",\n        features=[\n            ln.Feature(name=\"sample_id\", dtype=\"str\").save(),\n            ln.Feature(name=\"sample_name\", dtype=\"str\").save(),\n            ln.Feature(name=\"sample_type\", dtype=sample_type).save(),\n        ],\n        coerce=True,\n    ).save()\n\n    curator = ln.curators.DataFrameCurator(df, schema)\n    try:\n        curator.validate()\n    except ln.errors.ValidationError as error:\n        assert \"1 term not validated in feature 'sample_type': 'Type B'\" in str(error)\n\n    # add new from subtype\n    curator.cat.non_validated[\"sample_type\"]\n    curator.cat.add_new_from(\"sample_type\")\n    curator.validate()\n    assert sample_type.records.to_list(\"name\") == [\"Type A\", \"Type B\"]\n\n    # clean up\n    schema.delete(permanent=True)\n    
ln.Feature.filter().delete(permanent=True)\n    ln.Record.filter().update(type=None)\n    ln.Record.filter().delete(permanent=True)\n\n\ndef test_index_feature_exclusion_from_categoricals(df):\n    df_indexed = df.set_index(\"sample_id\")\n\n    sample_type_feature = ln.Feature(name=\"sample_type\", dtype=\"cat[ULabel]\").save()\n    sample_id_feature = ln.Feature(name=\"sample_id\", dtype=\"cat[ULabel]\").save()\n\n    # schema with sample_id as index (not in features)\n    schema = ln.Schema(features=[sample_type_feature], index=sample_id_feature).save()\n\n    curator = ln.curators.DataFrameCurator(df_indexed, schema)\n\n    # Verify that only sample_type is in categoricals, not sample_id (index)\n    categoricals_names = [\n        f.name for f in curator._atomic_curator._cat_manager._categoricals\n    ]\n    assert \"sample_type\" in categoricals_names\n    assert \"sample_id\" not in categoricals_names\n\n    # Verify the cat_vectors do not include the index feature\n    cat_vector_keys = list(curator.cat._cat_vectors.keys())\n    assert \"sample_type\" in cat_vector_keys\n    assert \"sample_id\" not in cat_vector_keys\n    assert \"columns\" in cat_vector_keys\n\n    # clean up\n    schema.delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_data_migrations.py",
    "content": "\"\"\"Tests for PostgreSQL data migrations.\"\"\"\n\nimport os\n\nimport lamindb as ln\nimport pytest\n\n\n@pytest.mark.skipif(\n    os.getenv(\"LAMINDB_TEST_DB_VENDOR\") != \"postgresql\",\n    reason=\"PostgreSQL-specific migration test\",\n)\ndef test_migrate_auxiliary_fields_postgres():\n    \"\"\"Test PostgreSQL migration of auxiliary fields for models.\n\n    This test verifies that migrate_auxiliary_fields_postgres correctly migrates:\n\n    **Artifact:**\n    - _save_completed from _aux['af']['0']\n\n    **Run:**\n    - cli_args from _aux['af']['0']\n\n    **Feature:**\n    - default_value from _aux['af']['0']\n    - nullable from _aux['af']['1'] (default: True)\n    - coerce from _aux['af']['2'] (default: False)\n    - For type features, all values are set to NULL\n\n    **Schema:**\n    - coerce from _aux['af']['0']\n    - flexible from _aux['af']['2'] (or computes from n_members)\n    - Converts negative n_members to NULL\n    - For type schemas, all values are set to NULL\n    - Preserves '1' (optionals) and '3' (index_feature_uid) in _aux\n    \"\"\"\n    from django.db import connection\n    from lamindb.models.schema import migrate_auxiliary_fields_postgres\n\n    # === Setup test data ===\n\n    # Create a Transform and Run for testing\n    transform = ln.Transform(key=\"test_migration_transform\").save()\n    run = ln.Run(transform=transform).save()\n\n    # Create an Artifact for testing\n    artifact = ln.Artifact(\".gitignore\", key=\"test_migration_artifact\").save()\n\n    # Create Features for testing (type and regular)\n    type_feature = ln.Feature(\n        name=\"TestMigrationTypeFeat\", dtype=str, is_type=True\n    ).save()\n    regular_feature = ln.Feature(name=\"test_migration_regular_feat\", dtype=str).save()\n\n    # Create Schemas for testing (type and regular)\n    type_schema = ln.Schema(name=\"TestMigrationTypeSchema\", is_type=True).save()\n    feature_for_schema1 = ln.Feature(\n        
name=\"test_migration_schema_feat1\", dtype=str\n    ).save()\n    feature_for_schema2 = ln.Feature(\n        name=\"test_migration_schema_feat2\", dtype=str\n    ).save()\n    regular_schema = ln.Schema(\n        name=\"TestMigrationRegularSchema\",\n        features=[feature_for_schema1, feature_for_schema2],\n        coerce=True,\n        flexible=True,\n    ).save()\n\n    # === Add _save_completed column temporarily (removed in migration 0173) ===\n    with connection.cursor() as cursor:\n        cursor.execute(\n            \"\"\"\n            DO $$\n            BEGIN\n                IF NOT EXISTS (\n                    SELECT 1 FROM information_schema.columns\n                    WHERE table_name = 'lamindb_artifact' AND column_name = '_save_completed'\n                ) THEN\n                    ALTER TABLE lamindb_artifact ADD COLUMN _save_completed BOOLEAN;\n                END IF;\n            END $$;\n            \"\"\"\n        )\n\n    # === Set old-style _aux data to simulate pre-migration state ===\n    with connection.cursor() as cursor:\n        # Artifact: set _aux with af containing _save_completed value\n        cursor.execute(\n            \"\"\"\n            UPDATE lamindb_artifact\n            SET _aux = '{\"af\": {\"0\": true}}'::jsonb,\n                _save_completed = NULL\n            WHERE id = %s\n            \"\"\",\n            [artifact.id],\n        )\n\n        # Run: set _aux with af containing cli_args value\n        cursor.execute(\n            \"\"\"\n            UPDATE lamindb_run\n            SET _aux = '{\"af\": {\"0\": \"--verbose --debug\"}}'::jsonb,\n                cli_args = NULL\n            WHERE id = %s\n            \"\"\",\n            [run.id],\n        )\n\n        # Feature (type): set _aux with af keys that should result in NULL values\n        cursor.execute(\n            \"\"\"\n            UPDATE lamindb_feature\n            SET _aux = '{\"af\": {\"0\": \"default_val\", \"1\": false, \"2\": 
true}}'::jsonb,\n                default_value = NULL,\n                nullable = NULL,\n                coerce = NULL\n            WHERE id = %s\n            \"\"\",\n            [type_feature.id],\n        )\n\n        # Feature (regular): set _aux with af keys for migration\n        cursor.execute(\n            \"\"\"\n            UPDATE lamindb_feature\n            SET _aux = '{\"af\": {\"0\": \"my_default\", \"1\": false, \"2\": true}}'::jsonb,\n                default_value = NULL,\n                nullable = NULL,\n                coerce = NULL\n            WHERE id = %s\n            \"\"\",\n            [regular_feature.id],\n        )\n\n        # Schema (type): set _aux with af keys that should be cleaned\n        cursor.execute(\n            \"\"\"\n            UPDATE lamindb_schema\n            SET _aux = '{\"af\": {\"0\": true, \"2\": false}}'::jsonb,\n                coerce = NULL,\n                flexible = NULL\n            WHERE id = %s\n            \"\"\",\n            [type_schema.id],\n        )\n\n        # Schema (regular): set _aux with af keys including optionals (key \"1\")\n        cursor.execute(\n            \"\"\"\n            UPDATE lamindb_schema\n            SET _aux = '{\"af\": {\"0\": true, \"1\": [\"uid1\", \"uid2\"], \"2\": true}}'::jsonb,\n                coerce = NULL,\n                flexible = NULL\n            WHERE id = %s\n            \"\"\",\n            [regular_schema.id],\n        )\n\n    # === Run the migration function ===\n    with connection.schema_editor() as schema_editor:\n        migrate_auxiliary_fields_postgres(schema_editor)\n\n    # === Refresh all objects from database ===\n    run.refresh_from_db()\n    type_feature.refresh_from_db()\n    regular_feature.refresh_from_db()\n    type_schema.refresh_from_db()\n    regular_schema.refresh_from_db()\n\n    # === Verify Artifact migration ===\n    with connection.cursor() as cursor:\n        cursor.execute(\n            \"SELECT _save_completed, _aux FROM 
lamindb_artifact WHERE id = %s\",\n            [artifact.id],\n        )\n        row = cursor.fetchone()\n        assert row[0] is True  # _save_completed from _aux['af']['0']\n        # _aux should have 'af' removed (was only key)\n        assert row[1] is None or \"af\" not in (\n            row[1] if isinstance(row[1], dict) else {}\n        )\n\n    # === Verify Run migration ===\n    assert run.cli_args == \"--verbose --debug\"  # from _aux['af']['0']\n    # _aux should have 'af' removed\n    assert run._aux is None or \"af\" not in run._aux\n\n    # === Verify Feature (type) migration ===\n    # Type features should have all values set to NULL\n    assert type_feature.default_value is None\n    assert type_feature.nullable is None\n    assert type_feature.coerce is None\n    # _aux should have 'af' removed\n    assert type_feature._aux is None or \"af\" not in type_feature._aux\n\n    # === Verify Feature (regular) migration ===\n    assert regular_feature.default_value == \"my_default\"  # from _aux['af']['0']\n    assert regular_feature.nullable is False  # from _aux['af']['1']\n    assert regular_feature.coerce is True  # from _aux['af']['2']\n    # _aux should have 'af' removed\n    assert regular_feature._aux is None or \"af\" not in regular_feature._aux\n\n    # === Verify Schema (type) migration ===\n    assert type_schema.coerce is None\n    assert type_schema.flexible is None\n    assert type_schema.n_members is None\n    # _aux should either be None or not have '0' and '2' keys in 'af'\n    if type_schema._aux is not None and \"af\" in type_schema._aux:\n        assert \"0\" not in type_schema._aux[\"af\"]\n        assert \"2\" not in type_schema._aux[\"af\"]\n\n    # === Verify Schema (regular) migration ===\n    assert regular_schema.coerce is True  # from _aux['af']['0']\n    assert regular_schema.flexible is True  # from _aux['af']['2']\n    # _aux should preserve key '1' (optionals)\n    assert regular_schema._aux is not None\n    assert 
\"af\" in regular_schema._aux\n    assert \"1\" in regular_schema._aux[\"af\"]\n    assert regular_schema._aux[\"af\"][\"1\"] == [\"uid1\", \"uid2\"]\n    # Keys '0' and '2' should be removed\n    assert \"0\" not in regular_schema._aux[\"af\"]\n    assert \"2\" not in regular_schema._aux[\"af\"]\n\n    # === Clean up: remove temporary column and delete records ===\n    with connection.cursor() as cursor:\n        cursor.execute(\n            \"\"\"\n            DO $$\n            BEGIN\n                IF EXISTS (\n                    SELECT 1 FROM information_schema.columns\n                    WHERE table_name = 'lamindb_artifact' AND column_name = '_save_completed'\n                ) THEN\n                    ALTER TABLE lamindb_artifact DROP COLUMN _save_completed;\n                END IF;\n            END $$;\n            \"\"\"\n        )\n\n    regular_schema.delete(permanent=True)\n    type_schema.delete(permanent=True)\n    feature_for_schema1.delete(permanent=True)\n    feature_for_schema2.delete(permanent=True)\n    regular_feature.delete(permanent=True)\n    type_feature.delete(permanent=True)\n    artifact.delete(permanent=True)\n    run.delete(permanent=True)\n    transform.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_db.py",
    "content": "import lamindb as ln\n\n\ndef test_create_to_load():\n    transform = ln.Transform(version=\"0\", key=\"test\", kind=\"pipeline\")\n    transform.save()\n    run = ln.Run(transform=transform)\n    run.save()\n    ln.Storage.get(root=str(ln.setup.settings.storage.root))\n"
  },
  {
    "path": "tests/core/test_delete.py",
    "content": "import bionty as bt\nimport lamindb as ln\nimport pytest\n\n\n@pytest.mark.parametrize(\"permanent\", [True, False])\ndef test_delete_qs(permanent):\n    \"\"\"Test deletion behavior for small (1) and large (>=2) querysets.\n\n    Small querysets delete individually, large ones trigger bulk delete.\"\"\"\n    ln.settings.creation.search_names = False\n    labels = [ln.Record(name=f\"label_{i}\") for i in range(3)]\n    ln.settings.creation.search_names = True\n    ln.save(labels)\n    ln.Record.filter(name__startswith=\"label_\").delete(permanent=permanent)\n    assert ln.Record.filter(name__startswith=\"label_\", branch_id=-1).count() == (\n        0 if permanent else 3\n    )\n    assert ln.ULabel.filter(name__startswith=\"label_\").count() == 0\n\n\ndef test_recreate_soft_deleted_record():\n    # testing soft delete and recreate with postgres (sqlite is tested in curators/test_records.py)\n    # soft delete a record, then recreate it with some changes\n    record = bt.Ethnicity.from_source(ontology_id=\"HANCESTRO:0006\").save()\n    assert record.branch_id == 1\n    record.delete()\n    assert record.branch_id == -1\n    # now recreate the same record from ontology_id with a different description\n    # there's a unique constraint on ontology_id, so this should recover the trashed record\n    record = bt.Ethnicity.from_source(ontology_id=\"HANCESTRO:0006\")\n    record.description = \"new description\"\n    record.save()\n    # now this record is recovered from the trash with the new description\n    assert record.branch_id == 1\n    assert record.description == \"new description\"\n    bt.Ethnicity.objects.filter().delete()\n"
  },
  {
    "path": "tests/core/test_feature.py",
    "content": "import bionty as bt\nimport lamindb as ln\nimport pandas as pd\nimport pytest\nfrom lamindb.errors import ValidationError\nfrom lamindb.models.feature import serialize_pandas_dtype\nfrom pandas.api.types import is_string_dtype\n\n\n@pytest.fixture(scope=\"module\")\ndef dict_data():\n    return {\n        \"dict_feat1\": 42,\n        \"dict_feat2\": 3.14,\n        \"dict_feat3\": \"somestring\",  # string (ambiguous cat ? str)\n        \"dict_feat4\": True,\n        \"dict_feat5\": [1, 2, 3],\n        \"dict_feat6\": [\"a\", \"b\", \"c\"],  # list[str] (ambiguous list[cat ? str])\n        \"dict_feat7\": {\"key\": \"value\"},\n    }\n\n\ndef test_feature_init():\n    # positional args not supported\n    with pytest.raises(ValueError):\n        ln.Feature(\"x\")\n\n    # dtype required unless is_type=True\n    with pytest.raises(ValidationError):\n        ln.Feature(name=\"feat\")\n\n    # is OK if also is_type is passed\n    ln.Feature(name=\"Feat\", is_type=True)\n\n    # invalid dtype string\n    with pytest.raises(ValueError):\n        ln.Feature(name=\"feat\", dtype=\"x\")\n\n    # categorical dtype must specify valid types\n    with pytest.raises(ValidationError):\n        ln.Feature(name=\"feat\", dtype=\"cat[1]\")\n\n    # ensure feat1 does not exist\n    if feat1 := ln.Feature.filter(name=\"feat1\").one_or_none() is not None:\n        feat1.delete(permanent=True)\n\n    feat1 = ln.Feature(name=\"feat\", dtype=\"str\").save()\n    # duplicate name with different dtype should fail\n    with pytest.raises(ValidationError) as error:\n        ln.Feature(name=\"feat\", dtype=ln.ULabel)\n    assert (\n        error.exconly()\n        == \"lamindb.errors.ValidationError: Feature feat already exists with dtype str, you passed cat[ULabel]\"\n    )\n    feat1.delete(permanent=True)\n\n    # string and list syntax for categorical dtypes should be equivalent and work\n    feat2 = ln.Feature(name=\"feat2\", dtype=\"str\", description=\"feat2\").save()\n   
 feat2_again = ln.Feature(name=\"feat2\", dtype=\"str\", description=\"feat2\").save()\n    assert feat2 == feat2_again\n    feat2.delete(permanent=True)\n\n    # categorical dtype with union of registries using string syntax must be valid\n    feature = ln.Feature(name=\"feat1\", dtype=\"cat[Record|bionty.Gene]\")\n    assert feature._dtype_str == \"cat[Record|bionty.Gene]\"\n    # categorical dtype with union of registries using objects must be valid\n    feature = ln.Feature(name=\"feat1\", dtype=[ln.Record, bt.Gene])\n    assert feature._dtype_str == \"cat[Record|bionty.Gene]\"\n\n    # dtype with field name before bracket filters must be valid\n    feature = ln.Feature(\n        name=\"gene_feature\", dtype=\"cat[bionty.Gene.ensembl_gene_id[organism='human']]\"\n    )\n    print(feature._dtype_str)\n    assert \"bionty.Gene\" in feature._dtype_str\n    assert \"ensembl_gene_id\" in feature._dtype_str\n    assert \"organism='human'\" in feature._dtype_str\n\n\n# @pytest.mark.skipif(\n#     os.getenv(\"LAMINDB_TEST_DB_VENDOR\") == \"sqlite\", reason=\"Postgres-only\"\n# )\n# def test_cannot_mutate_dtype():\n#     feature = ln.Feature(name=\"feature\", dtype=str).save()\n#     feature._dtype_str = int\n#     with pytest.raises(django.db.utils.IntegrityError) as error:\n#         feature.save()\n#     assert \"dtype field is immutable and cannot be changed\" in error.exconly()\n#     feature.delete(permanent=True)\n\n\n# def test_cat_filters_dtype():\n#     feature = ln.Feature(\n#         name=\"disease\",\n#         dtype=bt.Disease,\n#         cat_filters={\n#             \"source__uid\": \"4a3ejKuf\"\n#         },  # uid corresponds to disease_ontology_old.uid\n#     ).save()\n\n#     assert feature._dtype_str == \"cat[bionty.Disease[source__uid='4a3ejKuf']]\"\n\n#     feature.delete(permanent=True)\n\n\ndef test_cat_filters_empty_filter():\n    # empty filter values should be rejected\n    with pytest.raises(ValidationError) as error:\n        
ln.Feature(name=\"feat_empty\", dtype=bt.Disease, cat_filters={\"source__uid\": \"\"})\n    assert (\n        \"lamindb.errors.ValidationError: Empty value in filter source__uid\"\n        in error.exconly()\n    )\n\n\ndef test_cat_filters_invalid_field_name():\n    # invalid filter field names should be rejected\n    source = bt.Source(\n        name=\"\", description=\"\", organism=\"\", entity=\"\", version=\"\"\n    ).save()\n    with pytest.raises(ValidationError) as error:\n        ln.Feature(\n            name=\"feat_invalid_attr\",\n            dtype=bt.Disease,\n            cat_filters={\"source__invalid_field\": source},\n        )\n    assert (\n        \"lamindb.errors.ValidationError: SQLRecord Source has no attribute 'invalid_field' in filter source__invalid_field\"\n        in error.exconly()\n    )\n    source.delete(permanent=True)\n\n\ndef test_cat_filters_artifact_schema_filter():\n    schema_feature = ln.Feature(name=\"schema_filter_column\", dtype=str).save()\n    schema = ln.Schema(name=\"schema_filter_schema\", features=[schema_feature]).save()\n    try:\n        feature = ln.Feature(\n            name=\"artifact_input\",\n            dtype=ln.Artifact,\n            cat_filters={\"schema\": schema},\n        )\n        assert feature._dtype_str == f\"cat[Artifact[schema__uid='{schema.uid}']]\"\n    finally:\n        schema.delete(permanent=True)\n        schema_feature.delete(permanent=True)\n\n\ndef test_feature_from_df():\n    df = pd.DataFrame(\n        {\n            \"feat1\": [1, 2, 3],\n            \"feat2\": [3.1, 4.2, 5.3],\n            \"feat3\": pd.Categorical([\"cond1\", \"cond2\", \"cond2\"]),\n            \"feat4\": [\"id1\", \"id2\", \"id3\"],\n            \"rando_feature\": [\"rando1\", \"rando2\", \"rando3\"],\n        }\n    )\n    if feat1 := ln.Feature.filter(name=\"feat1\").one_or_none() is not None:\n        feat1.delete(permanent=True)\n    features = ln.Feature.from_dataframe(df.iloc[:, :4]).save()\n    artifact = 
ln.Artifact.from_dataframe(df, description=\"test\").save()\n    # test for deprecated add_feature_set\n    schema = ln.Schema(features).save()\n    artifact.features._add_schema(schema, slot=\"columns\")\n    features = artifact.features.slots[\"columns\"].features.all()\n    assert len(features) == len(df.columns[:4])\n    [col for col in df.columns if is_string_dtype(df[col])]\n    categoricals = {\n        col: df[col] for col in df.columns if isinstance(df[col], pd.CategoricalDtype)\n    }\n    for feature in features:\n        if feature.name in categoricals:\n            assert feature._dtype_str == \"cat\"\n        else:\n            orig_type = df[feature.name].dtype\n            assert feature._dtype_str == serialize_pandas_dtype(orig_type)\n    for feature in features:\n        feature.save()\n    labels = [ln.Record(name=name) for name in df[\"feat3\"].unique()]\n    ln.save(labels)\n    feature = ln.Feature.get(name=\"feat3\")\n    with pytest.raises(ValidationError) as err:\n        artifact.labels.add(labels, feature=feature)\n    assert (\n        err.exconly()\n        == \"lamindb.errors.ValidationError: Cannot manually annotate a feature measured *within* the dataset. 
Please use a Curator.\"\n    )\n    extfeature = ln.Feature(name=\"extfeat\", dtype=\"str\").save()\n    with pytest.raises(ValidationError) as err:\n        artifact.labels.add(labels, feature=extfeature)\n    assert (\n        err.exconly()\n        == f\"lamindb.errors.ValidationError: Feature {extfeature.name} needs dtype='cat' for label annotation, currently has dtype='str'\"\n    )\n\n    # clean up\n    artifact.delete(permanent=True)\n    ln.Schema.filter().delete(permanent=True)\n    ln.Record.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n\n\ndef test_feature_from_dict(dict_data):\n    # defaults to str for ambiguous types\n    features = ln.Feature.from_dict(dict_data)\n    assert len(features) == len(dict_data)\n    assert features[0]._dtype_str == \"int\"\n    assert features[1]._dtype_str == \"float\"\n    assert features[2]._dtype_str == \"str\"\n    assert features[3]._dtype_str == \"bool\"\n    assert features[4]._dtype_str == \"list[int]\"\n    assert features[5]._dtype_str == \"list[str]\"\n    assert features[6]._dtype_str == \"dict\"\n\n    # Wrong field\n    with pytest.raises(ValueError) as e:\n        ln.Feature.from_dict(dict_data, field=ln.Record.name)\n    assert \"field must be a Feature FieldAttr\" in str(e.value)\n\n    # Explicit field\n    features_with_field = ln.Feature.from_dict(dict_data, field=ln.Feature.name)\n    assert len(features_with_field) == len(dict_data)\n\n\ndef test_feature_from_dict_type(dict_data):\n    feature_type = ln.Feature(name=\"Testdata_feature_type\", is_type=True).save()\n    features = ln.Feature.from_dict(dict_data, type=feature_type).save()\n    for feature in features:\n        assert feature.type.name == \"Testdata_feature_type\"\n    ln.Feature.filter(type__isnull=False).delete(permanent=True)\n    feature_type.delete(permanent=True)\n\n\ndef test_feature_query_by_dtype():\n    \"\"\"Test querying Feature by dtype (deprecated) and _dtype_str.\"\"\"\n    str_feat = 
ln.Feature(name=\"test_str_feat\", dtype=str).save()\n    int_feat = ln.Feature(name=\"test_int_feat\", dtype=int).save()\n    try:\n        # Test querying by _dtype_str (current way)\n        str_features = ln.Feature.filter(_dtype_str=\"str\", name=\"test_str_feat\")\n        assert str_features.count() == 1\n        assert str_features.first() == str_feat\n\n        str_features = ln.Feature.filter(dtype_as_str=\"str\", name=\"test_str_feat\")\n        assert str_features.count() == 1\n        assert str_features.first() == str_feat\n\n        # Test querying by dtype (deprecated) - should work but issue warning\n        with pytest.warns(\n            DeprecationWarning,\n            match=\"Querying Feature by `dtype` is deprecated.*Notice the new dtype encoding format\",\n        ):\n            str_features_deprecated = ln.Feature.filter(\n                dtype=\"str\", name=\"test_str_feat\"\n            )\n            assert str_features_deprecated.count() == 1\n            assert str_features_deprecated.first() == str_feat\n    finally:\n        # Clean up\n        str_feat.delete(permanent=True)\n        int_feat.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_feature_dtype.py",
    "content": "import datetime\n\nimport bionty as bt\nimport lamindb as ln\nimport pandas as pd\nimport pytest\nfrom lamindb import Record\nfrom lamindb.errors import ValidationError\nfrom lamindb.models.feature import (\n    dtype_as_object,\n    parse_dtype,\n    parse_filter_string,\n    resolve_relation_filters,\n    serialize_dtype,\n)\n\n\n@pytest.fixture\ndef organism():\n    organism = bt.Organism(name=\"test_organism\")\n    organism.uid = \"testuid2\"\n    organism.save()\n    return organism\n\n\n# -----------------------------------------------------------------------------\n# serializing dtypes\n# -----------------------------------------------------------------------------\n\n\ndef test_serialize_basic_dtypes():\n    assert serialize_dtype(int) == \"int\"\n    assert serialize_dtype(float) == \"float\"\n    assert serialize_dtype(str) == \"str\"\n    assert serialize_dtype(bool) == \"bool\"\n    assert serialize_dtype(dict) == \"dict\"\n    # assert serialize_dtype(bytes) == \"bytes\"  # not yet supported\n    assert serialize_dtype(datetime.datetime) == \"datetime\"\n    assert serialize_dtype(datetime.date) == \"date\"\n\n\ndef test_serialize_basic_list_dtypes():\n    assert serialize_dtype(list[int]) == \"list[int]\"\n    assert serialize_dtype(list[float]) == \"list[float]\"\n    assert serialize_dtype(list[str]) == \"list[str]\"\n    assert serialize_dtype(list[bool]) == \"list[bool]\"\n    assert serialize_dtype(list[dict]) == \"list[dict]\"\n    assert serialize_dtype(list[datetime.datetime]) == \"list[datetime]\"\n    assert serialize_dtype(list[datetime.date]) == \"list[date]\"\n\n\ndef test_seralize_pandas_numpy_dtypes():\n    series = pd.Series([1, 4, 0, 10, 9], dtype=\"uint\")\n    assert series.dtype.name == \"uint64\"\n    assert serialize_dtype(series.dtype) == \"int\"\n\n\ndef test_serialize_user(ccaplog):\n    # correct way through Python object and serialize_dtype()\n    feature = ln.Feature(name=\"user_feat\", dtype=ln.User)\n    
assert feature._dtype_str == \"cat[User]\"\n    # legacy way through parse_dtype()\n    feature = ln.Feature(name=\"user_feat\", dtype=\"cat[User]\")\n    assert (\n        \"rather than passing a string 'cat[User]' to dtype, consider passing a Python object\"\n        in ccaplog.text\n    )\n    assert feature._dtype_str == \"cat[User]\"\n\n\ndef test_serialize_record_objects():\n    insitute_type = ln.Record(name=\"InstituteA\", is_type=True)\n    with pytest.raises(ln.errors.InvalidArgument) as error:\n        serialize_dtype(insitute_type)\n    assert (\n        f\"Cannot serialize unsaved objects. Save {insitute_type} via `.save()`.\"\n        in error.exconly()\n    )\n    insitute_type.save()\n    lab_type = ln.Record(name=\"LabB\", type=insitute_type, is_type=True).save()\n    sample_type = ln.Record(name=\"Sample\", type=lab_type, is_type=True).save()\n    # New UID-based format: cat[Record[uid]] instead of cat[Record[Parent[Child]]]\n    serialized_str = f\"cat[Record[{sample_type.uid}]]\"\n    feature = ln.Feature(name=\"sample_feature\", dtype=sample_type).save()\n    assert feature._dtype_str == serialized_str\n    assert feature.dtype == \"cat[Record[InstituteA[LabB[Sample]]]]\"\n    feature.delete(permanent=True)\n    assert serialize_dtype(sample_type) == serialized_str\n    with pytest.raises(ln.errors.IntegrityError) as error:\n        parse_dtype(\"cat[Record[Sample]]\", check_exists=True, old_format=True)\n    assert (\n        \"No Record type found matching subtypes ['Sample'] for field `.name`\"\n        in error.exconly()\n    )\n    sample = ln.Record(name=\"sample\").save()\n    with pytest.raises(ln.errors.InvalidArgument) as error:\n        parse_dtype(f\"cat[Record[{sample.uid}]]\", check_exists=True)\n    assert (\n        f\"The resolved Record 'sample' (uid='{sample.uid}') is not a type: is_type is False.\"\n        in error.exconly()\n    )\n    with pytest.raises(ln.errors.InvalidArgument) as error:\n        
serialize_dtype(sample)\n    assert (\n        \"Cannot serialize non-type Record 'sample'. Only types (is_type=True) are allowed in dtypes.\"\n        in error.exconly()\n    )\n    sample_type.delete(permanent=True)\n    lab_type.delete(permanent=True)\n    insitute_type.delete(permanent=True)\n    sample.delete(permanent=True)\n\n\ndef test_serialize_union_of_registries():\n    serialized_str = \"cat[Record|bionty.Gene]\"\n    assert serialize_dtype([ln.Record, bt.Gene]) == serialized_str\n    serialized_str = \"cat[bionty.CellType|bionty.CellLine]\"\n    assert serialize_dtype([bt.CellType, bt.CellLine]) == serialized_str\n\n\ndef test_serialize_with_field_information():\n    serialized_str = \"cat[bionty.Gene.ensembl_gene_id]\"\n    assert serialize_dtype(bt.Gene.ensembl_gene_id) == serialized_str\n    serialized_str = \"cat[bionty.CellType.uid|bionty.CellLine.uid]\"\n    assert serialize_dtype([bt.CellType.uid, bt.CellLine.uid]) == serialized_str\n\n\n# -----------------------------------------------------------------------------\n# parsing serialized dtypes\n# -----------------------------------------------------------------------------\n\n\ndef test_simple_record_with_subtype_and_field():\n    # Create a Record type to get its UID\n    customer_type = ln.Record(name=\"Customer\", is_type=True).save()\n    dtype_str = f\"cat[Record[{customer_type.uid}].name]\"\n    result = parse_dtype(dtype_str)\n    assert len(result) == 1\n    assert result[0] == {\n        \"registry_str\": \"Record\",\n        \"filter_str\": \"\",\n        \"field_str\": \"name\",\n        \"registry\": Record,\n        \"field\": Record.name,\n        \"record_uid\": customer_type.uid,\n    }\n    customer_type.delete(permanent=True)\n\n\ndef test_multiple_records_with_subtypes_and_fields():\n    # Create Record types to get their UIDs\n    customer_type = ln.Record(name=\"Customer\", is_type=True).save()\n    supplier_type = ln.Record(name=\"Supplier\", is_type=True).save()\n    
dtype_str = (\n        f\"cat[Record[{customer_type.uid}].name|Record[{supplier_type.uid}].name]\"\n    )\n    result = parse_dtype(dtype_str)\n    assert len(result) == 2\n    assert result[0] == {\n        \"registry_str\": \"Record\",\n        \"filter_str\": \"\",\n        \"field_str\": \"name\",\n        \"registry\": Record,\n        \"field\": Record.name,\n        \"record_uid\": customer_type.uid,\n    }\n    assert result[1] == {\n        \"registry_str\": \"Record\",\n        \"filter_str\": \"\",\n        \"field_str\": \"name\",\n        \"registry\": Record,\n        \"field\": Record.name,\n        \"record_uid\": supplier_type.uid,\n    }\n    customer_type.delete(permanent=True)\n    supplier_type.delete(permanent=True)\n\n\ndef test_bionty_celltype_with_field():\n    dtype_str = \"cat[bionty.CellType.ontology_id]\"\n    result = parse_dtype(dtype_str)\n    assert len(result) == 1\n    assert result[0] == {\n        \"registry_str\": \"bionty.CellType\",\n        \"filter_str\": \"\",\n        \"field_str\": \"ontology_id\",\n        \"registry\": bt.CellType,\n        \"field\": bt.CellType.ontology_id,\n    }\n\n\ndef test_bionty_perturbations_with_field():\n    dtype_str = \"cat[bionty.CellType.uid|bionty.CellLine.uid]\"\n    result = parse_dtype(dtype_str)\n    assert len(result) == 2\n    assert result[0] == {\n        \"registry_str\": \"bionty.CellType\",\n        \"filter_str\": \"\",\n        \"field_str\": \"uid\",\n        \"registry\": bt.CellType,\n        \"field\": bt.CellType.uid,\n    }\n    assert result[1] == {\n        \"registry_str\": \"bionty.CellLine\",\n        \"filter_str\": \"\",\n        \"field_str\": \"uid\",\n        \"registry\": bt.CellLine,\n        \"field\": bt.CellLine.uid,\n    }\n\n\ndef test_invalid_registry():\n    dtype_str = \"cat[InvalidRegistry.field]\"\n    with pytest.raises(ValidationError) as exc_info:\n        parse_dtype(dtype_str)\n    assert \"invalid dtype\" in str(exc_info.value)\n\n\ndef 
test_empty_category():\n    dtype_str = \"cat[]\"\n    result = parse_dtype(dtype_str)\n    assert result == []\n\n\ndef test_url_dtype_is_supported():\n    assert parse_dtype(\"url\") == []\n    feature = ln.Feature(name=\"website\", dtype=\"url\")\n    assert feature._dtype_str == \"url\"\n\n\ndef test_malformed_categorical():\n    dtype_str = \"cat ? str\"\n    with pytest.raises(ValueError) as err:\n        parse_dtype(dtype_str)\n    assert err.exconly().startswith(\n        f\"ValueError: dtype is '{dtype_str}' but has to be one of\"\n    )\n    dtype_str = \"cat[Record[Customer.name\"\n    with pytest.raises(ValueError) as err:\n        parse_dtype(dtype_str)\n    assert err.exconly().startswith(\n        f\"ValueError: dtype is '{dtype_str}' but has to be one of\"\n    )\n\n\ndef test_simple_registry_without_field():\n    dtype_str = \"cat[Record]\"\n    result = parse_dtype(dtype_str)\n    assert len(result) == 1\n    assert result[0] == {\n        \"registry_str\": \"Record\",\n        \"filter_str\": \"\",\n        \"field_str\": \"name\",\n        \"registry\": Record,\n        \"field\": Record.name,\n    }\n\n\ndef test_registry_with_subtype_no_field():\n    # Create a Record type to get its UID\n    customer_type = ln.Record(name=\"Customer\", is_type=True).save()\n    dtype_str = f\"cat[Record[{customer_type.uid}]]\"\n    result = parse_dtype(dtype_str)\n    assert len(result) == 1\n    assert result[0] == {\n        \"registry_str\": \"Record\",\n        \"filter_str\": \"\",\n        \"field_str\": \"name\",\n        \"registry\": Record,\n        \"field\": Record.name,\n        \"record_uid\": customer_type.uid,\n    }\n    customer_type.delete(permanent=True)\n\n\ndef test_list_of_dtypes():\n    # Create a Record type to get its UID\n    customer_type = ln.Record(name=\"Customer\", is_type=True).save()\n    dtype_str = f\"list[cat[Record[{customer_type.uid}]]]\"\n    result = parse_dtype(dtype_str)\n    assert len(result) == 1\n    assert 
result[0] == {\n        \"registry_str\": \"Record\",\n        \"filter_str\": \"\",\n        \"field_str\": \"name\",\n        \"registry\": Record,\n        \"field\": Record.name,\n        \"record_uid\": customer_type.uid,\n        \"list\": True,\n    }\n    assert serialize_dtype(list[bt.CellLine]) == \"list[cat[bionty.CellLine]]\"\n    customer_type.delete(permanent=True)\n\n\ndef test_registry_with_filter():\n    dtype_str = \"cat[bionty.Gene.ensembl_gene_id[source__id='abcd']]\"\n    result = parse_dtype(dtype_str)\n    assert len(result) == 1\n    assert result[0] == {\n        \"registry_str\": \"bionty.Gene\",\n        \"filter_str\": \"source__id='abcd'\",\n        \"field_str\": \"ensembl_gene_id\",\n        \"registry\": bt.Gene,\n        \"field\": bt.Gene.ensembl_gene_id,\n    }\n\n\ndef test_nested_cat_dtypes():\n    # Create Record types - the deepest type is UScustomer\n    customer_type = ln.Record(name=\"Customer\", is_type=True).save()\n    uscustomer_type = ln.Record(\n        name=\"UScustomer\", type=customer_type, is_type=True\n    ).save()\n    dtype_str = f\"cat[Record[{uscustomer_type.uid}].name]\"\n    result = parse_dtype(dtype_str)\n    assert len(result) == 1\n    assert result[0] == {\n        \"registry_str\": \"Record\",\n        \"filter_str\": \"\",\n        \"field_str\": \"name\",\n        \"registry\": Record,\n        \"field\": Record.name,\n        \"record_uid\": uscustomer_type.uid,\n    }\n    uscustomer_type.delete(permanent=True)\n    customer_type.delete(permanent=True)\n\n\ndef test_nested_cat_with_filter():\n    # Create Record types - the deepest type is UScustomer\n    # Note: filters in bracket content are not currently supported in UID format\n    # This test may need adjustment based on how filters are handled\n    customer_type = ln.Record(name=\"Customer\", is_type=True).save()\n    uscustomer_type = ln.Record(\n        name=\"UScustomer\", type=customer_type, is_type=True\n    ).save()\n    dtype_str = 
f\"cat[Record[{uscustomer_type.uid}].description]\"\n    result = parse_dtype(dtype_str)\n    assert len(result) == 1\n    assert result[0] == {\n        \"registry_str\": \"Record\",\n        \"filter_str\": \"\",\n        \"field_str\": \"description\",\n        \"registry\": Record,\n        \"field\": Record.description,\n        \"record_uid\": uscustomer_type.uid,\n    }\n    uscustomer_type.delete(permanent=True)\n    customer_type.delete(permanent=True)\n\n\n# -----------------------------------------------------------------------------\n# parsing django filter expressions\n# -----------------------------------------------------------------------------\n\n\ndef test_feature_dtype():\n    feature = ln.Feature(\n        name=\"disease\",\n        dtype=bt.Disease,\n        cat_filters={\n            \"source__uid\": \"4a3ejKuf\"\n        },  # uid corresponds to disease_ontology_old.uid\n    ).save()\n\n    result = parse_dtype(feature._dtype_str)\n    assert len(result) == 1\n    assert result[0] == {\n        \"registry_str\": \"bionty.Disease\",\n        \"filter_str\": \"source__uid='4a3ejKuf'\",\n        \"field_str\": \"name\",\n        \"registry\": bt.Disease,\n        \"field\": bt.Disease.name,\n    }\n\n    feature.delete(permanent=True)\n\n\ndef test_cat_filters_incompatible_with_union_dtypes():\n    with pytest.raises(ValidationError) as exc_info:\n        ln.Feature(\n            name=\"test_feature\",\n            dtype=\"cat[Record|bionty.CellType]\",\n            cat_filters={\"source\": \"test\"},\n        )\n    assert (\n        \"cat_filters are incompatible with union dtypes: 'cat[Record|bionty.CellType]'\"\n        in str(exc_info.value)\n    )\n\n\ndef test_cat_filters_incompatible_with_nested_dtypes():\n    record = ln.Record(name=\"Customer\", is_type=True).save()\n    with pytest.raises(ValidationError) as exc_info:\n        ln.Feature(\n            name=\"test_feature\",\n            dtype=record,\n            
cat_filters={\"source\": \"test\"},\n        )\n    assert (\n        f\"cat_filters are incompatible with nested dtypes: 'cat[Record[{record.uid}]]'\"\n        in str(exc_info.value)\n    )\n    record.delete(permanent=True)\n\n\ndef test_parse_filter_string_basic():\n    result = parse_filter_string(\"parent__id=123, category__name=electronics\")\n    expected = {\n        \"parent__id\": (\"parent\", \"id\", \"123\"),\n        \"category__name\": (\"category\", \"name\", \"electronics\"),\n    }\n    assert result == expected\n\n\ndef test_parse_filter_string_direct_fields():\n    result = parse_filter_string(\"name=test, status=active\")\n    expected = {\"name\": (\"name\", None, \"test\"), \"status\": (\"status\", None, \"active\")}\n    assert result == expected\n\n\ndef test_parse_filter_string_empty():\n    with pytest.raises(ValueError) as e:\n        parse_filter_string(\"\")\n        assert \"missing '=' sign\" in str(e)\n\n\ndef test_parse_filter_string_malformed():\n    with pytest.raises(ValueError) as e:\n        parse_filter_string(\"malformed_filter\")\n        assert \"missing '=' sign\" in str(e)\n\n\ndef test_parse_filter_string_missing_key():\n    with pytest.raises(ValueError) as e:\n        parse_filter_string(\"=someval\")\n        assert \"empty key\" in str(e)\n\n\ndef test_parse_filter_string_missing_value():\n    with pytest.raises(ValueError) as e:\n        parse_filter_string(\"somekey=\")\n        assert \"empty val\" in str(e)\n\n\ndef test_resolve_direct_fields():\n    parsed = {\"name\": (\"name\", None, \"test\"), \"status\": (\"status\", None, \"active\")}\n    result = resolve_relation_filters(parsed, bt.Gene)\n    assert result == {\"name\": \"test\", \"status\": \"active\"}\n\n\ndef test_resolve_relation_filter_with_uid():\n    source = bt.Source(\n        name=\"test_name\",\n        description=\"test_description\",\n        organism=\"human\",\n        entity=\"bionty.Gene\",\n        version=\"2026-01-01\",\n    )\n    
source.uid = \"testuid1\"\n    source.save()\n    parsed = {\"source__uid\": (\"source\", \"uid\", \"testuid1\")}\n    result = resolve_relation_filters(parsed, bt.Gene)\n    print(result)\n    assert result == {\"source\": source}\n    source.delete(permanent=True)\n\n\ndef test_resolve_relation_filter_with_name(organism):\n    parsed = {\"organism__name\": (\"organism\", \"name\", \"test_organism\")}\n    result = resolve_relation_filters(parsed, bt.Gene)\n    assert result == {\"organism\": organism}\n    organism.delete(permanent=True)\n\n\ndef test_resolve_multiple_relation_filters(organism):\n    source = bt.Source(\n        name=\"test_name\",\n        description=\"test_description\",\n        organism=\"human\",\n        entity=\"bionty.Gene\",\n        version=\"2026-01-01\",\n    )\n    source.uid = \"testuid1\"\n    source.save()\n    parsed = {\n        \"organism__name\": (\"organism\", \"name\", \"test_organism\"),\n        \"source__uid\": (\"source\", \"uid\", \"testuid1\"),\n    }\n    result = resolve_relation_filters(parsed, bt.Gene)\n    assert result == {\"organism\": organism, \"source\": source}\n    source.delete(permanent=True)\n    organism.delete(permanent=True)\n\n\ndef test_resolve_nested_filter(organism):\n    parsed = {\"organism__name__contains\": (\"organism\", \"name__contains\", \"test_orga\")}\n    result = resolve_relation_filters(parsed, bt.Gene)\n    assert result == {\"organism\": organism}\n    organism.delete(permanent=True)\n\n\ndef test_resolve_relation_filter_failed_resolution():\n    parsed = {\"organism__name\": (\"organism\", \"name\", \"nonexistent\")}\n    with pytest.raises(bt.Organism.DoesNotExist):\n        resolve_relation_filters(parsed, bt.Gene)\n\n\ndef test_resolve_relation_filter_duplicate():\n    parsed = {\n        \"source__uid\": (\"source\", \"uid\", \"testuid1\"),\n        \"source__name\": (\"source\", \"name\", \"test_name\"),\n    }\n    with pytest.raises(bt.Source.DoesNotExist):\n        
resolve_relation_filters(parsed, bt.Gene)\n\n\n# -----------------------------------------------------------------------------\n# backward compatibility for old format strings\n# -----------------------------------------------------------------------------\n\n\ndef test_convert_old_format_ulabel_string():\n    \"\"\"Test converting old format ULabel string to object.\"\"\"\n    # Create a ULabel type\n    perturbation = ln.ULabel(name=\"Perturbation\", is_type=True).save()\n\n    # Convert old format string\n    dtype_obj = dtype_as_object(\"cat[ULabel[Perturbation]]\", old_format=True)\n\n    # Should return the ULabel object\n    assert dtype_obj == perturbation\n    assert hasattr(dtype_obj, \"uid\")\n\n    # Clean up\n    perturbation.delete(permanent=True)\n\n\ndef test_convert_old_format_record_string():\n    \"\"\"Test converting old format Record string to object.\"\"\"\n    # Create a Record type\n    sample_type = ln.Record(name=\"Sample\", is_type=True).save()\n\n    # Convert old format string\n    dtype_obj = dtype_as_object(\"cat[Record[Sample]]\", old_format=True)\n\n    # Should return the Record object\n    assert dtype_obj == sample_type\n    assert hasattr(dtype_obj, \"uid\")\n\n    # Clean up\n    sample_type.delete(permanent=True)\n\n\ndef test_convert_old_format_nested_record_string():\n    \"\"\"Test converting old format nested Record string to object.\"\"\"\n    # Create nested Record types\n    lab_type = ln.Record(name=\"LabA\", is_type=True).save()\n    experiment_type = ln.Record(name=\"Experiment\", type=lab_type, is_type=True).save()\n\n    # Convert old format string\n    dtype_obj = dtype_as_object(\"cat[Record[LabA[Experiment]]]\", old_format=True)\n\n    # Should return the nested Record object\n    assert dtype_obj == experiment_type\n    assert hasattr(dtype_obj, \"uid\")\n\n    # Clean up\n    experiment_type.delete(permanent=True)\n    lab_type.delete(permanent=True)\n\n\ndef test_convert_old_format_list_string():\n    
\"\"\"Test converting old format list string to object.\"\"\"\n    # Create a ULabel type\n    perturbation = ln.ULabel(name=\"Perturbation\", is_type=True).save()\n\n    # Convert old format string with list wrapper\n    dtype_obj = dtype_as_object(\"list[cat[ULabel[Perturbation]]]\", old_format=True)\n\n    # Should return list[ULabel] type\n    assert hasattr(dtype_obj, \"__origin__\")\n    assert dtype_obj.__origin__ is list\n    # Get the inner type\n    from typing import get_args\n\n    inner_type = get_args(dtype_obj)[0]\n    assert inner_type == perturbation\n\n    # Clean up\n    perturbation.delete(permanent=True)\n\n\ndef test_feature_constructor_with_old_format_string(ccaplog):\n    \"\"\"Test Feature constructor with old format string raises deprecation warning.\"\"\"\n    # Create a ULabel type\n    perturbation = ln.ULabel(name=\"Perturbation\", is_type=True).save()\n\n    # Create feature with old format string\n    feature = ln.Feature(name=\"perturbation\", dtype=\"cat[ULabel[Perturbation]]\")\n    assert (\n        \"rather than passing a string 'cat[ULabel[Perturbation]]' to dtype, consider passing a Python object\"\n        in ccaplog.text\n    )\n\n    # Should have converted to UID format\n    assert feature._dtype_str is not None\n    assert \"ULabel[\" in feature._dtype_str\n    # Should contain UID, not name\n    assert \"Perturbation\" not in feature._dtype_str\n    assert perturbation.uid in feature._dtype_str\n\n    # Clean up\n    perturbation.delete(permanent=True)\n\n\ndef test_feature_constructor_with_old_format_nested_string(ccaplog):\n    \"\"\"Test Feature constructor with old format nested string.\"\"\"\n    # Create nested Record types\n    lab_type = ln.Record(name=\"LabA\", is_type=True).save()\n    experiment_type = ln.Record(name=\"Experiment\", type=lab_type, is_type=True).save()\n\n    # Create feature with old format nested string\n    feature = ln.Feature(name=\"experiment\", dtype=\"cat[Record[LabA[Experiment]]]\")\n  
  assert (\n        \"rather than passing a string 'cat[Record[LabA[Experiment]]]' to dtype, consider passing a Python object\"\n        in ccaplog.text\n    )\n\n    # Should have converted to UID format\n    assert feature._dtype_str is not None\n    assert \"Record[\" in feature._dtype_str\n    # Should contain UID, not names\n    assert \"LabA\" not in feature._dtype_str\n    assert \"Experiment\" not in feature._dtype_str\n    assert experiment_type.uid in feature._dtype_str\n\n    # Clean up\n    experiment_type.delete(permanent=True)\n    lab_type.delete(permanent=True)\n\n\ndef test_bare_cat_dtype_backward_compatibility():\n    \"\"\"Test that bare 'cat' dtype is accepted for backward compatibility.\"\"\"\n    # Test parse_dtype accepts \"cat\" and returns empty list\n    result = parse_dtype(\"cat\")\n    assert result == []\n\n    # Test Feature constructor with bare \"cat\" dtype issues deprecation warning\n    with pytest.warns(DeprecationWarning, match=\"dtype `cat` is deprecated\"):\n        feature = ln.Feature(name=\"test_bare_cat\", dtype=\"cat\")\n    assert feature._dtype_str == \"cat\"\n\n\ndef test_migrate_dtype_to_uid_format():\n    \"\"\"Test migrate_dtype_to_uid_format() function for migration.\"\"\"\n    from django.db import connection\n    from lamindb.models.feature import migrate_dtype_to_uid_format\n\n    # Create Record types for testing\n    lab_type = ln.Record(name=\"LabA\", is_type=True).save()\n    experiment_type = ln.Record(name=\"Experiment\", type=lab_type, is_type=True).save()\n    perturbation = ln.ULabel(name=\"Perturbation\", is_type=True).save()\n\n    # Create features with old format strings in _dtype_str\n    feature1 = ln.Feature(name=\"test_record_old_format\", dtype=\"str\").save()\n    feature2 = ln.Feature(name=\"test_ulabel_old_format\", dtype=\"str\").save()\n    feature3 = ln.Feature(name=\"test_list_record_old_format\", dtype=\"str\").save()\n    feature4 = ln.Feature(name=\"test_list_ulabel_old_format\", 
dtype=\"str\").save()\n\n    # Manually set old format strings using raw SQL\n    old_format_record = \"cat[Record[LabA[Experiment]]]\"\n    old_format_ulabel = \"cat[ULabel[Perturbation]]\"\n    old_format_list_record = \"list[cat[Record[LabA[Experiment]]]]\"\n    old_format_list_ulabel = \"list[cat[ULabel[Perturbation]]]\"\n\n    with connection.cursor() as cursor:\n        cursor.execute(\n            \"UPDATE lamindb_feature SET _dtype_str = %s WHERE id = %s\",\n            [old_format_record, feature1.id],\n        )\n        cursor.execute(\n            \"UPDATE lamindb_feature SET _dtype_str = %s WHERE id = %s\",\n            [old_format_ulabel, feature2.id],\n        )\n        cursor.execute(\n            \"UPDATE lamindb_feature SET _dtype_str = %s WHERE id = %s\",\n            [old_format_list_record, feature3.id],\n        )\n        cursor.execute(\n            \"UPDATE lamindb_feature SET _dtype_str = %s WHERE id = %s\",\n            [old_format_list_ulabel, feature4.id],\n        )\n\n    # Refresh features from database\n    feature1.refresh_from_db()\n    feature2.refresh_from_db()\n    feature3.refresh_from_db()\n    feature4.refresh_from_db()\n\n    # Verify old format is present\n    assert feature1._dtype_str == old_format_record\n    assert feature2._dtype_str == old_format_ulabel\n    assert feature3._dtype_str == old_format_list_record\n    assert feature4._dtype_str == old_format_list_ulabel\n\n    # Run migration function\n    migrate_dtype_to_uid_format(connection, input_field=\"_dtype_str\")\n\n    # Refresh features from database\n    feature1.refresh_from_db()\n    feature2.refresh_from_db()\n    feature3.refresh_from_db()\n    feature4.refresh_from_db()\n\n    # Verify conversion to UID format\n    assert feature1._dtype_str == f\"cat[Record[{experiment_type.uid}]]\"\n    assert feature2._dtype_str == f\"cat[ULabel[{perturbation.uid}]]\"\n    assert feature3._dtype_str == f\"list[cat[Record[{experiment_type.uid}]]]\"\n    assert 
feature4._dtype_str == f\"list[cat[ULabel[{perturbation.uid}]]]\"\n\n    # Verify old names are not in the converted strings\n    assert \"LabA\" not in feature1._dtype_str\n    assert \"Experiment\" not in feature1._dtype_str\n    assert \"Perturbation\" not in feature2._dtype_str\n    assert \"LabA\" not in feature3._dtype_str\n    assert \"Experiment\" not in feature3._dtype_str\n    assert \"Perturbation\" not in feature4._dtype_str\n\n    # Verify UIDs are present\n    assert experiment_type.uid in feature1._dtype_str\n    assert perturbation.uid in feature2._dtype_str\n    assert experiment_type.uid in feature3._dtype_str\n    assert perturbation.uid in feature4._dtype_str\n\n    # Clean up\n    feature1.delete(permanent=True)\n    feature2.delete(permanent=True)\n    feature3.delete(permanent=True)\n    feature4.delete(permanent=True)\n    experiment_type.delete(permanent=True)\n    lab_type.delete(permanent=True)\n    perturbation.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_from_values.py",
    "content": "import bionty as bt\nimport lamindb as ln\nimport pandas as pd\nimport pytest\n\n\n@pytest.fixture(scope=\"module\")\ndef df():\n    return pd.DataFrame(\n        (\n            [\"T cell\", \"CL:0000084\"],\n            [\"hepatocyte\", \"CL:0000182\"],\n            [\"my new cell type\", \"\"],\n        ),\n        columns=[\"cell_type\", \"cell_type_id\"],\n    )\n\n\ndef test_from_values_name(df):\n    bt.CellType.filter().delete(permanent=True)\n    assert df[\"cell_type\"].tolist() == [\"T cell\", \"hepatocyte\", \"my new cell type\"]\n    # create records from bionty\n    result = bt.CellType.from_values(df.cell_type, \"name\")\n    ids = [i.ontology_id for i in result]\n    assert len(result) == 2\n    assert set(ids) == {\"CL:0000084\", \"CL:0000182\"}\n    assert result[0].source.entity == \"bionty.CellType\"\n\n    # wrong field type\n    with pytest.raises(TypeError):\n        result = bt.CellType.from_values(df.cell_type, field=bt.CellType)\n\n\ndef test_from_values_ontology_id(df):\n    assert df[\"cell_type_id\"].tolist() == [\"CL:0000084\", \"CL:0000182\", \"\"]\n    result = bt.CellType.from_values(df.cell_type_id, \"ontology_id\")\n    names = {i.name for i in result}\n    assert len(result) == 2\n    assert names == {\"T cell\", \"hepatocyte\"}\n    assert result[0].source.entity == \"bionty.CellType\"\n\n\ndef test_from_values_multiple_match():\n    records = bt.Gene.from_values([\"ABC1\", \"PDCD1\"], bt.Gene.symbol, organism=\"human\")\n    assert len(records) == 3\n\n\ndef test_get_or_create_records():\n    names = [\"record\" + str(i) for i in range(25)]\n    labels = [ln.Record(name=name) for name in names]\n    ln.save(labels)\n    # more than 20 existing values\n    labels = ln.Record.from_values(names, field=\"name\")\n    assert len(labels) == 25\n\n\ndef test_from_values_synonyms_aware():\n    bt.CellType.from_source(name=\"T cell\").save()\n    # existing validated values\n    records = bt.CellType.from_values([\"T 
cell\"], \"name\")\n    assert len(records) == 1\n    assert records[0].name == \"T cell\"\n    assert isinstance(records[0].source, bt.Source)\n    # existing validated values and synonyms\n    records = bt.CellType.from_values([\"T cell\", \"T-cell\"], \"name\")\n    assert len(records) == 1\n    assert records[0].name == \"T cell\"\n    assert isinstance(records[0].source, bt.Source)\n    # bionty values and synonyms\n    records = bt.CellType.from_values([\"B-cell\", \"B cell\"], \"name\")\n    assert len(records) == 1\n    assert records[0].name == \"B cell\"\n    assert isinstance(records[0].source, bt.Source)\n    # all possibilities of validated values\n    records = bt.CellType.from_values(\n        [\"T cell\", \"T-cell\", \"t cell\", \"B cell\", \"B-cell\"], \"name\"\n    )\n    assert len(records) == 2\n    names = [r.name for r in records]\n    assert set(names) == {\"T cell\", \"B cell\"}\n    assert isinstance(records[0].source, bt.Source)\n    assert isinstance(records[1].source, bt.Source)\n    # non-validated values\n    records = bt.CellType.from_values([\"T cell\", \"mycell\"], \"name\")\n    assert len(records) == 1\n    assert records[0].name == \"T cell\"\n    assert isinstance(records[0].source, bt.Source)\n    assert records[0].ontology_id == \"CL:0000084\"\n    bt.CellType.filter().delete(permanent=True)\n\n\ndef test_standardize():\n    # only name field can be standardized\n    results = bt.Gene.from_values(\n        [\"HES4\", \"TNFRSF4\"], field=bt.Gene.ensembl_gene_id, organism=\"human\"\n    )\n    assert len(results) == 0\n\n    results = bt.Gene.from_values(\n        [\"HES4\", \"TNFRSF4\"], field=bt.Gene.symbol, organism=\"human\"\n    )\n    assert len(results) == 2\n\n\ndef test_from_values_no_source():\n    # remove source of ExperimentalFactor\n    source = bt.Source.filter(entity=\"bionty.ExperimentalFactor\").first()\n    source.delete(permanent=True)\n    assert not bt.ExperimentalFactor.from_values([\"scrnaseq\"])\n    
source.save()\n"
  },
  {
    "path": "tests/core/test_has_parents.py",
    "content": "import bionty as bt\nimport lamindb as ln\n\n\ndef test_view_parents():\n    label1 = ln.Record(name=\"label1\")\n    label2 = ln.Record(name=\"label2\")\n    label1.save()\n    label2.save()\n    label1.parents.add(label2)\n    label1.view_parents(ln.Record.name, distance=1)\n    label1.delete(permanent=True)\n    label2.delete(permanent=True)\n\n\ndef test_query_parents_children():\n    label1 = ln.Record(name=\"label1\").save()\n    label2 = ln.Record(name=\"label2\").save()\n    label3 = ln.Record(name=\"label3\").save()\n    label1.children.add(label2)\n    label2.children.add(label3)\n    parents = label3.query_parents()\n    assert len(parents) == 2\n    assert label1 in parents and label2 in parents\n    children = label1.query_children()\n    assert len(children) == 2\n    assert label2 in children and label3 in children\n    label1.delete(permanent=True)\n    label2.delete(permanent=True)\n    label3.delete(permanent=True)\n\n\ndef test_view_lineage_circular():\n    import pandas as pd\n\n    transform = ln.Transform(key=\"test\").save()\n    run = ln.Run(transform=transform).save()\n    artifact = ln.Artifact.from_dataframe(\n        pd.DataFrame({\"a\": [1, 2, 3]}), description=\"test artifact\", run=run\n    ).save()\n    run.input_artifacts.add(artifact)\n    artifact.view_lineage()\n    artifact.delete(permanent=True)\n    transform.delete(permanent=True)\n\n\ndef test_view_parents_connected_instance():\n    ct = bt.CellType.connect(\"laminlabs/cellxgene\").first()\n\n    if ct and hasattr(ct, \"parents\"):\n        ct.view_parents(distance=2, with_children=True)\n\n\ndef test_query_relatives_connected_instance():\n    ct = bt.CellType.connect(\"laminlabs/cellxgene\").filter(name=\"T cell\").first()\n\n    if ct:\n        parents = ct.query_parents()\n        assert parents.db == \"laminlabs/cellxgene\"\n\n        children = ct.query_children()\n        assert children.db == \"laminlabs/cellxgene\"\n\n\ndef 
test_view_lineage_connected_instance():\n    af = ln.Artifact.connect(\"laminlabs/cellxgene\").first()\n\n    if af and af.run:\n        af.view_lineage()\n"
  },
  {
    "path": "tests/core/test_has_type.py",
    "content": "import os\n\nimport lamindb as ln\nimport pytest\nfrom django.db import IntegrityError\n\n\n@pytest.mark.parametrize(\n    \"model_class,extra_kwargs\",\n    [\n        (ln.Record, {}),\n        (ln.Feature, {\"dtype\": \"str\"}),\n        (ln.Schema, {\"itype\": ln.Feature}),\n        (ln.Project, {}),\n        (ln.Reference, {}),\n        (ln.ULabel, {}),\n    ],\n)\ndef test_invalid_type(model_class, extra_kwargs):\n    # also see test_invalid_type_record_with_schema in test_record.py\n    model_name = model_class.__name__.lower()\n\n    no_type = model_class(name=\"no_type\", **extra_kwargs).save()\n    if model_name == \"schema\":\n        extra_kwargs[\"is_type\"] = True  # to avoid triggering hash look up\n    with pytest.raises(ValueError) as error:\n        model_class(name=\"WithInvalidType\", type=no_type, **extra_kwargs).save()\n    assert error.exconly().startswith(\n        f\"ValueError: You can only assign a {model_name} with `is_type=True` as `type` to another {model_name}\"\n    )\n    # test at the database level\n    if os.getenv(\"LAMINDB_TEST_DB_VENDOR\") != \"sqlite\":\n        no_type.is_type = True\n        with pytest.raises(IntegrityError) as error:\n            model_class(name=\"WithInvalidType\", type=no_type, **extra_kwargs).save()\n        assert f\"{model_name}_type_is_valid_fk\" in error.exconly()\n    no_type.delete(permanent=True)\n\n\n@pytest.mark.skipif(\n    os.getenv(\"LAMINDB_TEST_DB_VENDOR\") == \"sqlite\", reason=\"Postgres-only\"\n)\n@pytest.mark.parametrize(\"model_class\", [ln.Record, ln.ULabel])\ndef test_prevent_type_cycle(model_class):\n    type_a = model_class(name=\"TypeA\", is_type=True).save()\n    type_b = model_class(name=\"TypeB\", is_type=True).save()\n\n    # Set A's parent to B\n    type_a.type = type_b\n    type_a.save()  # A → B, this is fine\n\n    # Try to set B's parent to A (would create cycle B → A → B)\n    type_b.type = type_a\n\n    with pytest.raises(Exception) as exc_info:\n      
  type_b.save()\n\n    assert \"cycle\" in str(exc_info.value).lower()\n\n    # Try to set type to itself\n    type_a.type = type_a\n\n    with pytest.raises(Exception) as exc_info:\n        type_a.save()\n\n    assert \"cycle\" in str(exc_info.value).lower()\n\n    type_a.delete(permanent=True)\n    type_b.delete(permanent=True)\n\n\n@pytest.mark.parametrize(\"model_class\", [ln.Record, ln.ULabel, ln.Project])\ndef test_query_sub_types_super_types_instances(model_class):\n    model_name = model_class.__name__.lower()\n\n    # Create type hierarchy\n    type1 = model_class(name=\"Type1\", is_type=True).save()\n    type2 = model_class(name=\"Type2\", is_type=True, type=type1).save()\n    type3 = model_class(name=\"Type3\", is_type=True, type=type2).save()\n\n    # Create instances\n    instance1 = model_class(name=f\"{model_name}1\", type=type1).save()\n    instance2 = model_class(name=f\"{model_name}2\", type=type3).save()\n    instance3 = model_class(name=f\"{model_name}3\", type=type3).save()\n\n    # Get the query method dynamically\n    query_method = getattr(type1, f\"query_{model_name}s\")\n\n    # Children\n    assert getattr(type1, model_name + \"s\").count() == 2  # direct instances\n    assert query_method().count() == 5\n\n    # Super types\n    super_types = instance3.query_types()\n    assert len(super_types) == 3\n    assert super_types[0] == type3\n    assert super_types[1] == type2\n    assert super_types[2] == type1\n\n    # Move type2 to trash\n    type2.delete()\n    assert query_method().count() == 1\n\n    # Cleanup\n    instance1.delete(permanent=True)\n    instance2.delete(permanent=True)\n    instance3.delete(permanent=True)\n    type3.delete(permanent=True)\n    type2.delete(permanent=True)\n    type1.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_integrity.py",
    "content": "import lamindb_setup as ln_setup\n\n\ndef test_migrate_check():\n    assert ln_setup.migrate.check()\n\n\ndef test_system_check():\n    ln_setup.django(\"check\")\n"
  },
  {
    "path": "tests/core/test_is_versioned.py",
    "content": "import lamindb as ln\nimport pandas as pd\nimport pytest\nfrom lamindb.models._is_versioned import (\n    _adjust_is_latest_when_deleting_is_versioned,\n    bump_version,\n    set_version,\n)\n\n\n@pytest.fixture(scope=\"module\")\ndef df1():\n    return pd.DataFrame({\"feat1\": [1, 2]})\n\n\n@pytest.fixture(scope=\"module\")\ndef df2():\n    return pd.DataFrame({\"feat1\": [2, 3]})\n\n\ndef test_set_version():\n    # all remaining lines are covered in notebooks\n    with pytest.raises(ValueError):\n        set_version(None, \"weird-version\")\n    assert set_version(None, \"1.2\") == \"2\"\n    assert set_version(None, \"0\") == \"1\"\n    assert set_version(None, \"1\") == \"2\"\n    assert set_version(\"1.2.3\", \"0\") == \"1.2.3\"\n    assert set_version(\"1.2.3\") == \"1.2.3\"\n\n\ndef test_bump_version():\n    current_version_major_only = \"2\"\n    current_version_major_minor = \"2.1\"\n    weird_version = \"weird-version\"\n    with pytest.raises(ValueError):\n        bump_version(weird_version)\n    assert bump_version(weird_version, behavior=\"ignore\") == \"?\"\n    assert bump_version(current_version_major_only, bump_type=\"major\") == \"3\"\n    assert bump_version(current_version_major_only, bump_type=\"minor\") == \"2.1\"\n    assert bump_version(current_version_major_minor, bump_type=\"major\") == \"3\"\n    assert bump_version(current_version_major_minor, bump_type=\"minor\") == \"2.2\"\n\n\ndef test_add_to_version_family(df1, df2):\n    artifact1 = ln.Artifact.from_dataframe(df1, description=\"test1\").save()\n    artifact2 = ln.Artifact.from_dataframe(df2, description=\"test2\").save()\n    assert (\n        artifact1.uid[: artifact1._len_stem_uid]\n        != artifact2.uid[: artifact2._len_stem_uid]\n    )\n    artifact2._add_to_version_family(artifact1)\n    assert (\n        artifact1.uid[: artifact1._len_stem_uid]\n        == artifact2.uid[: artifact2._len_stem_uid]\n    )\n    assert (\n        artifact1.path.name[: 
artifact1._len_stem_uid]\n        == artifact2.path.name[: artifact2._len_stem_uid]\n    )\n    artifact1.delete(permanent=True)\n    artifact2.delete(permanent=True)\n\n\ndef test_transform_versioning_based_on_key():\n    transform1 = ln.Transform(\n        key=\"test-pipeline\",\n        version=\"1.0\",\n        source_code=\"1\",\n        kind=\"pipeline\",\n    ).save()\n    assert transform1.is_latest\n    assert transform1.version_tag == \"1.0\"\n    assert transform1.version == \"1.0\"\n\n    with pytest.raises(ValueError) as e:\n        transform2 = ln.Transform(\n            key=\"test-pipeline\",\n            version=\"1.0\",\n            source_code=\"2\",\n            kind=\"pipeline\",\n        ).save()\n    assert (\n        e.exconly()\n        == \"ValueError: Please change the version tag or leave it `None`, '1.0' is already taken\"\n    )\n\n    transform2 = ln.Transform(\n        key=\"test-pipeline\",\n        # do not pass the version tag, which corresponds to: version=None\n        source_code=\"2\",\n        kind=\"pipeline\",\n    ).save()\n\n    assert transform2.version_tag is None\n    assert transform2.version == transform2.uid[-4:]  # version falls back to uid suffix\n    assert transform2.is_latest\n    assert transform2.hash != transform1.hash\n    assert not ln.Transform.get(key=\"test-pipeline\", version=\"1.0\").is_latest\n\n    transform3 = ln.Transform(\n        key=\"test-pipeline\",\n        version=\"abcd\",  # mimic commit hash\n        source_code=\"3\",\n        kind=\"pipeline\",\n    ).save()\n\n    assert transform3.version_tag == \"abcd\"\n    assert transform3.version == \"abcd\"\n    assert transform3.is_latest\n    assert transform3.hash != transform2.hash\n    assert not ln.Transform.get(key=\"test-pipeline\", source_code=\"2\").is_latest\n\n\ndef test_transform_versioning_based_on_revises():\n    # build one version family\n    transform_v1 = ln.Transform(key=\"Introduction\").save()\n    assert 
transform_v1.is_latest\n    assert transform_v1.version_tag is None\n\n    # pass the latest version\n    transform_v2 = ln.Transform(\n        key=\"Introduction v2\", revises=transform_v1, version=\"2\"\n    ).save()\n    assert not transform_v1.is_latest\n    assert transform_v2.is_latest\n    assert transform_v2.uid.endswith(\"0001\")\n    assert transform_v2.version_tag == \"2\"\n    assert transform_v2.version == \"2\"\n\n    # consciously *not* pass the latest version to revises but the previous\n    # it automatically retrieves the latest version\n    transform_v3 = ln.Transform(key=\"Introduction\", revises=transform_v1).save()\n    assert transform_v3.uid.endswith(\"0002\")\n    assert not ln.Transform.get(key=\"Introduction v2\", version=\"2\").is_latest\n    assert transform_v3.is_latest\n    # no source code code was yet saved, returning existing transform with same key\n    transform_v4 = ln.Transform(key=\"Introduction\").save()\n    assert transform_v4 == transform_v3\n\n    assert len(ln.Transform.filter(key=\"Introduction\")) == 2\n    assert len(ln.Transform.filter(key=\"Introduction\").filter(is_latest=True)) == 1\n    assert ln.Transform.get(key=\"Introduction\") == transform_v3\n    assert ln.Transform.filter(key=\"Introduction\").get(is_latest=True) == transform_v3\n\n    # test get\n    assert ln.Transform.get(transform_v3.uid) == transform_v3\n    assert ln.Transform.get(transform_v3.id) == transform_v3\n    assert ln.Transform.get(transform_v3.uid[:-4]) == transform_v3\n\n    # test empty QuerySet\n    assert (\n        ln.Transform.filter(key=\"IntroductionNotExists\")\n        .filter(is_latest=True)\n        .one_or_none()\n        is None\n    )\n\n    # test soft delete\n    transform_v3.delete()\n    assert transform_v2.is_latest\n\n    # test hard delete\n    transform_v2.delete(permanent=True)\n    assert (\n        transform_v1_retrieved := ln.Transform.get(transform_v3.uid[:-4])\n    ) == transform_v1\n    assert 
transform_v1_retrieved.is_latest\n\n    # test soft delete on the last existing version does not change is_latest\n    transform_v1_retrieved.delete()\n    assert (\n        transform_v1_retrieved := ln.Transform.get(transform_v1.uid)\n    ) == transform_v1\n    assert transform_v1_retrieved.is_latest\n\n    # fully delete\n    transform_v1.delete(permanent=True)\n\n    # last object that exists is in the trash\n    assert ln.Transform.get(transform_v3.uid[:-4]) == transform_v3\n    assert transform_v3.branch_id == -1\n    transform_v3.delete(permanent=True)\n\n\ndef test_transform_versioning_across_branches_preserves_main_latest():\n    main_branch = ln.Branch.get(name=\"main\")\n    ln.setup.switch(main_branch.name)\n    branch = ln.Branch(name=\"test_versioning_branch_latest\").save()\n    transform_v1 = ln.Transform(\n        key=\"test-branch-aware-is-latest\",\n        source_code=\"main-v1\",\n        kind=\"pipeline\",\n    ).save()\n    try:\n        ln.setup.switch(branch.name)\n        transform_v2 = ln.Transform(\n            key=\"test-branch-aware-is-latest\",\n            revises=transform_v1,\n            source_code=\"feature-v2\",\n            kind=\"pipeline\",\n        ).save()\n        transform_v1.refresh_from_db()\n        assert transform_v1.is_latest\n        assert transform_v2.is_latest\n\n        # Passing an older revises still increments from the family max uid.\n        transform_v3 = ln.Transform(\n            key=\"test-branch-aware-is-latest\",\n            revises=transform_v1,\n            source_code=\"feature-v3\",\n            kind=\"pipeline\",\n        ).save()\n        transform_v2.refresh_from_db()\n        transform_v1.refresh_from_db()\n        assert transform_v3.uid.endswith(\"0002\")\n        assert not transform_v2.is_latest\n        assert transform_v3.is_latest\n        assert transform_v1.is_latest\n    finally:\n        ln.setup.switch(main_branch.name)\n        for uid in (transform_v1.uid[:-4],):\n            
for record in ln.Transform.objects.filter(uid__startswith=uid):\n                record.delete(permanent=True)\n        branch.delete(permanent=True)\n\n\ndef test_path_rename():\n    # this is related to renames inside _add_to_version_family\n    with open(\"test_new_path.txt\", \"w\") as f:\n        f.write(\"test_new_path\")\n    old_path = ln.UPath(\"s3://lamindata/.lamindb/test_new_path.txt\")\n    old_path.upload_from(\"./test_new_path.txt\")\n    assert old_path.exists()\n    new_path = old_path.rename(old_path.with_name(\"test_new_path2.txt\"))\n    assert new_path.exists()\n    assert new_path.as_posix() == \"s3://lamindata/.lamindb/test_new_path2.txt\"\n    assert not old_path.exists()\n    new_path.unlink()\n    ln.UPath(\"./test_new_path.txt\").unlink()\n\n\ndef test_version_backward_compatibility():\n    \"\"\"Test that queries using version= still work (backward compatibility).\"\"\"\n    # Create transforms with different versions and source_code to avoid deduplication\n    transform1 = ln.Transform(\n        key=\"test-backward-compat\",\n        version=\"1.0\",\n        kind=\"pipeline\",\n        source_code=\"code1\",\n    ).save()\n    transform2 = ln.Transform(\n        key=\"test-backward-compat\",\n        version=\"2.0\",\n        kind=\"pipeline\",\n        source_code=\"code2\",\n    ).save()\n\n    # Test that we can query using version= (old API)\n    found = ln.Transform.get(key=\"test-backward-compat\", version=\"1.0\")\n    assert found == transform1\n    assert found.version_tag == \"1.0\"\n    assert found.version == \"1.0\"\n\n    found = ln.Transform.get(key=\"test-backward-compat\", version=\"2.0\")\n    assert found == transform2\n    assert found.version_tag == \"2.0\"\n    assert found.version == \"2.0\"\n\n    # Test filter with version=\n    results = ln.Transform.filter(key=\"test-backward-compat\", version=\"1.0\")\n    assert len(results) == 1\n    assert results.first() == transform1\n\n    # Test with Artifact\n    
artifact1 = ln.Artifact.from_dataframe(\n        pd.DataFrame({\"col1\": [1, 2]}), key=\"test-artifact.parquet\", version=\"1.0\"\n    ).save()\n    artifact2 = ln.Artifact.from_dataframe(\n        pd.DataFrame({\"col1\": [3, 4]}), key=\"test-artifact.parquet\", version=\"2.0\"\n    ).save()\n\n    found_artifact = ln.Artifact.get(key=\"test-artifact.parquet\", version=\"1.0\")\n    assert found_artifact == artifact1\n    assert found_artifact.version_tag == \"1.0\"\n    assert found_artifact.version == \"1.0\"\n\n    found_artifact = ln.Artifact.get(key=\"test-artifact.parquet\", version=\"2.0\")\n    assert found_artifact == artifact2\n    assert found_artifact.version_tag == \"2.0\"\n    assert found_artifact.version == \"2.0\"\n\n    # Cleanup\n    transform1.delete(permanent=True)\n    transform2.delete(permanent=True)\n    artifact1.delete(permanent=True)\n    artifact2.delete(permanent=True)\n\n\ndef test_adjust_is_latest_when_deleting_is_versioned():\n    \"\"\"Direct unit test for _adjust_is_latest_when_deleting_is_versioned (covers multiple promoted).\"\"\"\n    # Build two version families, each with v1 (older) and v2 (latest)\n    v1a = ln.Transform(key=\"Adjust latest family A\").save()\n    v2a = ln.Transform(revises=v1a, key=\"Adjust latest family A\").save()\n    v1b = ln.Transform(key=\"Adjust latest family B\").save()\n    v2b = ln.Transform(revises=v1b, key=\"Adjust latest family B\").save()\n    assert v2a.is_latest and v2b.is_latest\n    assert not v1a.is_latest and not v1b.is_latest\n\n    # Delete both latest → two promoted (covers \"new latest ... 
versions: [...]\" branch)\n    promoted = _adjust_is_latest_when_deleting_is_versioned([v2a, v2b])\n    assert len(promoted) == 2\n    assert set(promoted) == {v1a.pk, v1b.pk}\n\n    v1a.refresh_from_db()\n    v1b.refresh_from_db()\n    assert v1a.is_latest and v1b.is_latest\n\n    # Edge case: empty list returns []\n    assert _adjust_is_latest_when_deleting_is_versioned([]) == []\n\n    # Clean up\n    v2a.delete(permanent=True)\n    v2b.delete(permanent=True)\n    v1a.delete(permanent=True)\n    v1b.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_label_manager.py",
    "content": "from pathlib import Path\n\nimport bionty as bt\nimport lamindb as ln\nimport pytest\nfrom _dataset_fixtures import (  # noqa\n    get_mini_csv,\n)\nfrom lamindb.errors import ValidationError\nfrom lamindb.models.artifact import add_labels\n\n\n@pytest.fixture(scope=\"module\")\ndef adata():\n    adata = ln.examples.datasets.anndata_with_obs()\n    # add another column\n    adata.obs[\"cell_type_by_expert\"] = adata.obs[\"cell_type\"]\n    adata.obs.loc[\"obs0\", \"cell_type_by_expert\"] = \"B cell\"\n    return adata\n\n\ndef test_labels_add(adata):\n    label = ln.Record(name=\"Experiment 1\")\n    artifact = ln.Artifact.from_anndata(adata, description=\"test\").save()\n    experiment = ln.Feature(name=\"experiment\", dtype=ln.Record)\n    with pytest.raises(ValueError) as error:\n        artifact.labels.add(\"experiment_1\", experiment)\n    assert (\n        error.exconly()\n        == \"ValueError: Please pass a record (a `SQLRecord` object), not a string, e.g.,\"\n        \" via: label = ln.Record(name='experiment_1')\"\n    )\n    with pytest.raises(ValidationError) as error:\n        artifact.labels.add(label, experiment)\n    assert \"not validated. If it looks correct: record.save()\" in error.exconly()\n    label.save()\n    with pytest.raises(TypeError) as error:\n        artifact.labels.add(label, \"experiment 1\")\n    with pytest.raises(ValidationError) as error:\n        artifact.labels.add(label, feature=experiment)\n    assert (\n        error.exconly()\n        == \"lamindb.errors.ValidationError: Feature not validated. 
If it looks\"\n        \" correct: ln.Feature(name='experiment', type='cat[Record]').save()\"\n    )\n    experiment.save()\n\n    # try to pass list of length zero\n    artifact.labels.add([], feature=experiment)\n    # now pass a single label\n    artifact.labels.add(label, feature=experiment)\n    # check that the feature was updated with type = \"Record\"\n    feature = ln.Feature.get(name=\"experiment\")\n    assert feature._dtype_str == \"cat[Record]\"\n    with pytest.raises(TypeError):\n        experiments = artifact.labels.get(\"experiment\")\n    # check that the label is there, it's exactly one label with name \"Experiment 1\"\n    experiments = artifact.labels.get(experiment)\n    assert experiments.one().name == \"Experiment 1\"\n\n    # try adding the same label again, nothing should happen\n    artifact.labels.add(label, feature=experiment)\n    # check that the label is there, it's exactly one label with name \"Experiment 1\"\n    experiments = artifact.labels.get(experiment)\n    assert experiments.get().name == \"Experiment 1\"\n\n    # running from_values to load validated label records under the hood\n    experiment = ln.Feature(name=\"experiment_with_reg\", dtype=\"cat[Record]\").save()\n    ln.Record(name=\"Experiment 2\").save()\n    artifact.labels.add(\"Experiment 2\", experiment)\n    experiments = artifact.labels.get(experiment)\n    assert experiments.get().name == \"Experiment 2\"\n\n    # now, try adding a new label\n    project = ln.Record(name=\"project 1\").save()\n    ln.Feature(name=\"project\", dtype=ln.Record).save()\n    features = ln.Feature.lookup()\n    artifact.labels.add(project, feature=features.project)\n    # check that the label is there, it's exactly one label with name \"Experiment 1\"\n    projects = artifact.labels.get(features.project)\n    assert projects.get().name == \"project 1\"\n\n    # test add_from\n    adata2 = adata.copy()\n    adata2.uns[\"mutated\"] = True\n    artifact2 = ln.Artifact(adata2, 
description=\"My new artifact\").save()\n\n    artifact2.labels.add_from(artifact)\n    experiments = artifact2.labels.get(experiment)\n    assert experiments.get().name == \"Experiment 2\"\n\n    artifact2.delete(permanent=True)\n    artifact.delete(permanent=True)\n    ln.Schema.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n    ln.Record.filter().delete(permanent=True)\n\n\ndef test_labels_add_using_anndata(adata):\n    organism = bt.Organism.from_source(name=\"mouse\")\n    cell_types = [bt.CellType(name=name) for name in adata.obs[\"cell_type\"].unique()]\n    ln.save(cell_types)\n    inspector = bt.CellType.inspect(adata.obs[\"cell_type_by_expert\"].unique())\n    ln.save([bt.CellType(name=name) for name in inspector.non_validated])\n    cell_types_from_expert = bt.CellType.from_values(\n        adata.obs[\"cell_type_by_expert\"].unique()\n    )\n    actual_tissues = [bt.Tissue(name=name) for name in adata.obs[\"tissue\"].unique()]\n    organoid = ln.Record(name=\"organoid\")\n    tissues = actual_tissues + [organoid]\n    ln.save(tissues)\n\n    # clean up DB state\n    organism_feature = ln.Feature.filter(name=\"organism\").one_or_none()\n    if organism_feature is not None:\n        organism_feature.delete(permanent=True)\n    artifact = ln.Artifact.filter(description=\"Mini adata\").one_or_none()\n    if artifact is not None:\n        artifact.delete(permanent=True, storage=True)\n    ln.Schema.filter().delete(permanent=True)\n\n    # try to construct without registering metadata features\n    artifact = ln.Artifact.from_anndata(adata, description=\"Mini adata\")\n    if not artifact._state.adding:\n        artifact.delete(permanent=True)  # make sure we get a fresh one\n        artifact = ln.Artifact.from_anndata(adata, description=\"Mini adata\")\n    # add feature set without saving file\n    feature_name_feature = ln.Feature(name=\"feature name\", dtype=\"cat[Record]\").save()\n    schema = 
ln.Schema(features=[feature_name_feature])\n    with pytest.raises(ValueError) as error:\n        artifact.features._add_schema(schema, slot=\"random\")\n    assert (\n        error.exconly()\n        == \"ValueError: Please save the artifact or collection before adding a feature\"\n        \" set!\"\n    )\n\n    # now register features we want to validate\n    # (we are not interested in cell_type_id, here)\n    ln.Feature(name=\"cell_type\", dtype=bt.CellType).save()\n    ln.Feature(name=\"disease\", dtype=ln.Record).save()\n    ln.Feature(name=\"cell_type_by_expert\", dtype=bt.CellType).save()\n    artifact = ln.Artifact.from_anndata(adata, description=\"Mini adata\")\n    ln.Feature(name=\"organism\", dtype=bt.Organism).save()\n    features = ln.Feature.lookup()\n    with pytest.raises(ValueError) as error:\n        artifact.labels.add(organism, feature=features.organism)\n    assert (\n        error.exconly()\n        == \"ValueError: Please save the artifact/collection before adding a label!\"\n    )\n    artifact.save()\n\n    # now, we add organism and run checks\n    features = ln.Feature.lookup()\n    with pytest.raises(ln.errors.ValidationError):\n        artifact.labels.add(organism, feature=features.organism)\n    organism.save()\n    artifact.labels.add(organism, feature=features.organism)\n    organism_link = artifact.links_organism.first()\n    assert organism_link.organism.name == \"mouse\"\n    assert organism_link.feature.name == \"organism\"\n    feature = ln.Feature.get(name=\"organism\")\n    assert feature._dtype_str == \"cat[bionty.Organism]\"\n\n    # now we add cell types & tissues and run checks\n    ln.Feature(name=\"cell_type\", dtype=bt.CellType).save()\n    ln.Feature(name=\"cell_type_by_expert\", dtype=bt.CellType).save()\n    add_labels(artifact, cell_types, feature=features.cell_type, from_curator=True)\n    add_labels(\n        artifact,\n        cell_types_from_expert,\n        feature=features.cell_type_by_expert,\n        
from_curator=True,\n    )\n    feature_tissue_simple = ln.Feature(name=\"tissue_simple\", dtype=bt.Tissue).save()\n    with pytest.raises(ValidationError) as err:\n        add_labels(artifact, tissues, feature=feature_tissue_simple, from_curator=True)\n    assert (\n        err.exconly()\n        == \"lamindb.errors.ValidationError: Label type Record is not valid for Feature(name='tissue_simple', dtype='cat[bionty.Tissue]'), consider a feature with dtype='cat[bionty.Tissue|Record]'\"\n    )\n    tissue = ln.Feature(name=\"tissue\", dtype=\"cat[bionty.Tissue|Record]\").save()\n    add_labels(artifact, tissues, feature=tissue, from_curator=True)\n    feature = ln.Feature.get(name=\"cell_type\")\n    assert feature._dtype_str == \"cat[bionty.CellType]\"\n    feature = ln.Feature.get(name=\"cell_type_by_expert\")\n    assert feature._dtype_str == \"cat[bionty.CellType]\"\n    feature = ln.Feature.get(name=\"tissue\")\n    assert feature._dtype_str == \"cat[bionty.Tissue|Record]\"\n    diseases = [ln.Record(name=name) for name in adata.obs[\"disease\"].unique()]\n    ln.save(diseases)\n    add_labels(artifact, diseases, feature=features.disease, from_curator=True)\n\n    # now, let's add another feature to ext\n    experiment_1 = ln.Record(name=\"experiment_1\").save()\n    ln.Feature(name=\"experiment\", dtype=ln.Record).save()\n    features = ln.Feature.lookup()\n    artifact.labels.add(experiment_1, feature=features.experiment)\n\n    assert set(artifact.labels.get(features.experiment).to_list(\"name\")) == {\n        \"experiment_1\"\n    }\n    assert set(artifact.labels.get(features.disease).to_list(\"name\")) == {\n        \"chronic kidney disease\",\n        \"Alzheimer disease\",\n        \"liver lymphoma\",\n        \"cardiac ventricle disorder\",\n    }\n    assert set(artifact.labels.get(features.organism).to_list(\"name\")) == {\"mouse\"}\n    assert set(\n        artifact.labels.get(features.tissue)[\"bionty.Tissue\"].to_list(\"name\")\n    ) == {\n        
\"liver\",\n        \"heart\",\n        \"kidney\",\n        \"brain\",\n    }\n    assert set(artifact.labels.get(features.tissue)[\"Record\"].to_list(\"name\")) == {\n        \"organoid\",\n    }\n    # currently, we can't stratify the two cases below\n    assert set(artifact.labels.get(features.cell_type).to_list(\"name\")) == {\n        \"T cell\",\n        \"my new cell type\",\n        \"hepatocyte\",\n        \"hematopoietic stem cell\",\n        \"B cell\",\n    }\n    assert set(artifact.labels.get(features.cell_type, flat_names=True)) == {\n        \"T cell\",\n        \"my new cell type\",\n        \"hepatocyte\",\n        \"hematopoietic stem cell\",\n        \"B cell\",\n    }\n    assert set(artifact.labels.get(features.cell_type_by_expert).to_list(\"name\")) == {\n        \"T cell\",\n        \"my new cell type\",\n        \"hepatocyte\",\n        \"hematopoietic stem cell\",\n        \"B cell\",\n    }\n    assert experiment_1 in artifact.records.all()\n\n    # call describe\n    artifact.describe()\n\n    # clean up\n    artifact.delete(permanent=True)\n    bt.Gene.filter().delete(permanent=True)\n    bt.Organism.filter().delete(permanent=True)\n    ln.Schema.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n    bt.CellType.filter().delete(permanent=True)\n    bt.Tissue.filter().delete(permanent=True)\n    bt.Disease.filter().delete(permanent=True)\n    ln.Record.filter().delete(permanent=True)\n\n\ndef test_labels_get(get_mini_csv: Path):  # noqa: F811\n    artifact = ln.Artifact(get_mini_csv, description=\"test\")\n    # feature doesn't exist\n    with pytest.raises(TypeError):\n        artifact.labels.get(\"x\")  # type: ignore\n    # no linked labels\n    feature_name_feature = ln.Feature(name=\"feature name\", dtype=ln.ULabel).save()\n    schema = ln.Schema(features=[feature_name_feature]).save()\n    artifact.save()\n    # test for deprecated add_schema\n    artifact.features._add_schema(schema, 
slot=\"random\")\n    assert artifact.schemas.first() == schema\n    artifact.delete(permanent=True, storage=True)\n    schema.delete(permanent=True)\n    feature_name_feature.delete(permanent=True)\n\n\n@pytest.fixture\ndef get_test_artifacts():\n    with open(\"./default_storage_unit_core/test-inherit1\", \"w\") as f:\n        f.write(\"artifact1\")\n    with open(\"./default_storage_unit_core/test-inherit2\", \"w\") as f:\n        f.write(\"artifact2\")\n    artifact1 = ln.Artifact(\"./default_storage_unit_core/test-inherit1\")\n    artifact1.save()\n    artifact2 = ln.Artifact(\"./default_storage_unit_core/test-inherit2\")\n    artifact2.save()\n    yield artifact1, artifact2\n    artifact1.delete(permanent=True, storage=True)\n    artifact2.delete(permanent=True, storage=True)\n\n\ndef test_add_from(get_test_artifacts):\n    artifact1, artifact2 = get_test_artifacts\n    label_names = [f\"Project {i}\" for i in range(3)]\n    records = [ln.Record(name=label_name) for label_name in label_names]\n    ln.save(records)\n\n    cell_line_names = [f\"Cell line {i}\" for i in range(3)]\n    cell_lines = [bt.CellLine(name=name) for name in cell_line_names]\n    ln.save(cell_lines)\n\n    # pass a list of length 0\n    artifact2.labels.add([])\n    # now actually pass the labels\n    artifact2.labels.add(records)\n    # here test add without passing a feature\n    artifact2.labels.add(cell_lines)\n    assert artifact2.cell_lines.count() == len(cell_lines)\n\n    assert artifact1.records.exists() is False\n    artifact1.labels.add_from(artifact2)\n    assert artifact1.records.count() == artifact2.records.count()\n    assert artifact1.cell_lines.count() == artifact2.cell_lines.count()\n\n    artifact2.cell_lines.remove(*cell_lines)\n    artifact1.cell_lines.remove(*cell_lines)\n    artifact2.records.remove(*records)\n    artifact1.records.remove(*records)\n\n    for record in records:\n        record.delete(permanent=True)\n    for cell_line in cell_lines:\n        
cell_line.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_load.py",
    "content": "from pathlib import Path\n\nimport anndata as ad\nimport lamindb as ln\nimport pandas as pd\nimport pytest\n\n# ruff: noqa: F811\nfrom _dataset_fixtures import get_small_mdata, get_small_sdata  # noqa\n\n\n@pytest.fixture(scope=\"module\")\ndef zip_file():\n    filepath = Path(\"test.zip\")\n    with open(filepath, \"w\") as f:\n        f.write(\"some\")\n    yield filepath\n    filepath.unlink()\n\n\n@pytest.fixture(scope=\"module\")\ndef html_filepath():\n    filepath = Path(\"./tmp.html\")\n    with open(filepath, \"w\") as f:\n        f.write(\"<html><body><h1>Test</h1></body></html>\")\n    yield filepath\n    filepath.unlink()\n\n\n@pytest.fixture(scope=\"module\")\ndef json_filepath():\n    filepath = Path(\"./tmp.json\")\n    with open(filepath, \"w\") as f:\n        f.write('{\"a\": 1}')\n    yield filepath\n    filepath.unlink()\n\n\n@pytest.fixture(scope=\"module\")\ndef csv_filepath():\n    filepath = Path(\"./tmp.csv\")\n    with open(filepath, \"w\") as f:\n        f.write(\"a,b\\n1,2\")\n    yield filepath\n    filepath.unlink()\n\n\n@pytest.fixture(scope=\"module\")\ndef tsv_filepath():\n    filepath = Path(\"./tmp.tsv\")\n    with open(filepath, \"w\") as f:\n        f.write(\"a\\tb\\n1\\t2\")\n    yield filepath\n    filepath.unlink()\n\n\n@pytest.fixture(scope=\"module\")\ndef parquet_filepath():\n    filepath = Path(\"./tmp.parquet\")\n    df = pd.DataFrame({\"a\": [1, 2], \"b\": [3, 4]})\n    df.to_parquet(filepath)\n    yield filepath\n    filepath.unlink()\n\n\n@pytest.fixture(scope=\"module\")\ndef yaml_filepath():\n    filepath = Path(\"./tmp.yaml\")\n    with open(filepath, \"w\") as f:\n        f.write(\"a: 1\\nb: 2\")\n    yield filepath\n    filepath.unlink()\n\n\n@pytest.fixture(scope=\"module\")\ndef image_filepath():\n    filepath = Path(\"./tmp.png\")\n    with open(filepath, \"w\") as f:\n        f.write(\"mock image\")\n    yield filepath\n    filepath.unlink()\n\n\n@pytest.fixture(scope=\"module\")\ndef 
svg_filepath():\n    filepath = Path(\"./tmp.svg\")\n    with open(filepath, \"w\") as f:\n        f.write(\"<svg><rect width='100' height='100'/></svg>\")\n    yield filepath\n    filepath.unlink()\n\n\n@pytest.fixture(scope=\"module\")\ndef rds_filepath():\n    filepath = Path(\"./tmp.rds\")\n    with open(filepath, \"w\") as f:\n        f.write(\"mock rds\")\n    yield filepath\n    filepath.unlink()\n\n\n@pytest.fixture(scope=\"module\")\ndef local_anndata_filepath():\n    return ln.examples.datasets.anndata_file_pbmc68k_test().resolve()\n\n\n@pytest.fixture(scope=\"module\")\ndef adata(local_anndata_filepath):\n    return ad.read_h5ad(local_anndata_filepath)\n\n\ndef test_load_anndata(local_anndata_filepath, adata):\n    artifact = ln.Artifact(local_anndata_filepath, description=\"test\")\n    assert local_anndata_filepath == artifact._local_filepath\n    assert local_anndata_filepath == artifact.path\n    assert local_anndata_filepath == artifact.cache()\n\n    artifact = ln.Artifact.from_anndata(adata, description=\"test\")\n    assert artifact._memory_rep is adata\n    assert artifact.load() is adata\n    assert artifact._local_filepath.resolve() == artifact.cache() == artifact.path\n\n\ndef test_load_mudata(get_small_mdata):\n    artifact = ln.Artifact.from_mudata(get_small_mdata, description=\"test\")\n    assert artifact._memory_rep is get_small_mdata\n    assert artifact.load() is get_small_mdata\n    assert artifact._local_filepath.resolve() == artifact.cache() == artifact.path\n\n\ndef test_load_spatialdata(get_small_sdata):\n    artifact = ln.Artifact.from_spatialdata(get_small_sdata, description=\"test\")\n    assert artifact._memory_rep is get_small_sdata\n    assert artifact.load() is get_small_sdata\n    assert artifact._local_filepath.resolve() == artifact.cache() == artifact.path\n\n\ndef load_blobs__repr__():\n    example_blobs_sdata = ln.examples.datasets.spatialdata_blobs()\n    blobs_af = ln.Artifact.from_spatialdata(\n        
example_blobs_sdata, key=\"example_blobs.zarr\"\n    ).save()\n    example_blobs_sdata = blobs_af.load()\n    # Must exist and not throw errors\n    assert example_blobs_sdata.__repr__\n\n\ndef test_load_html(html_filepath):\n    artifact = ln.Artifact(html_filepath, key=str(html_filepath))\n    artifact.load()\n\n\ndef test_load_json(json_filepath):\n    artifact = ln.Artifact(json_filepath, key=str(json_filepath))\n    dictionary = artifact.load()\n    assert dictionary[\"a\"] == 1\n\n\ndef test_no_loader(zip_file):\n    artifact = ln.Artifact(zip_file, key=str(zip_file))\n    with pytest.raises(NotImplementedError):\n        artifact.load()\n\n\ndef test_load_csv(csv_filepath):\n    artifact = ln.Artifact(csv_filepath, key=str(csv_filepath))\n    df = artifact.load()\n    assert df.iloc[0, 0] == 1\n    assert df.iloc[0, 1] == 2\n\n\ndef test_load_tsv(tsv_filepath):\n    artifact = ln.Artifact(tsv_filepath, key=str(tsv_filepath))\n    df = artifact.load()\n    assert df.iloc[0, 0] == 1\n    assert df.iloc[0, 1] == 2\n\n\ndef test_load_parquet(parquet_filepath):\n    artifact = ln.Artifact(parquet_filepath, key=str(parquet_filepath))\n    df = artifact.load()\n    assert df.iloc[0, 0] == 1\n    assert df.iloc[1, 1] == 4\n\n\ndef test_load_yaml(yaml_filepath):\n    artifact = ln.Artifact(yaml_filepath, key=str(yaml_filepath))\n    data = artifact.load()\n    assert data[\"a\"] == 1\n    assert data[\"b\"] == 2\n\n\ndef test_load_image(image_filepath):\n    artifact = ln.Artifact(image_filepath, key=str(image_filepath))\n    result = artifact.load()\n    assert Path(result).name == image_filepath.name\n\n\ndef test_load_svg(svg_filepath):\n    artifact = ln.Artifact(svg_filepath, key=str(svg_filepath))\n    result = artifact.load()\n    assert Path(result).name == svg_filepath.name\n\n\ndef test_load_rds(rds_filepath, ccaplog):\n    artifact = ln.Artifact(rds_filepath, key=str(rds_filepath))\n    result = artifact.load()\n    assert \"Please use `laminr` to load 
`.rds` files\" in ccaplog.text\n    assert Path(result).name == rds_filepath.name\n"
  },
  {
    "path": "tests/core/test_manager.py",
    "content": "import lamindb as ln\n\n\ndef test_manager_list():\n    label = ln.Record(name=\"manager label\")\n    label.save()\n    label_names = [f\"Record {i}\" for i in range(3)]\n    labels = [ln.Record(name=name) for name in label_names]\n    ln.save(labels)\n    label.parents.set(labels)\n    assert len(label.parents.to_list()) == 3\n    assert \"Record 1\" in label.parents.to_list(\"name\")\n    label.delete(permanent=True)\n    for label in labels:\n        label.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_merge.py",
    "content": "\"\"\"Tests for ln.setup.merge.\"\"\"\n\nimport lamindb as ln\nimport pytest\n\n\ndef test_merge_branch_into_main():\n    \"\"\"Merge a branch into main: create branch, add ULabel, switch to main, merge.\"\"\"\n    branch = ln.Branch(name=\"test_merge_branch\").save()\n    assert branch.status == \"standalone\"\n    ln.setup.switch(branch.name)\n    assert ln.setup.settings.branch == branch\n    assert ln.setup.settings.branch.name == \"test_merge_branch\"\n\n    ulabel = ln.ULabel(name=\"test_merge_record\").save()\n    assert ulabel.branch == branch\n    assert ulabel.created_on == branch  # created_on set to creation branch\n\n    ln.setup.switch(\"main\")\n    assert ln.setup.settings.branch.name == \"main\"\n    assert ln.setup.settings.branch.status == \"standalone\"\n    assert ln.ULabel.filter(name=\"test_merge_record\").count() == 0\n\n    ln.setup.merge(\"test_merge_branch\")\n    assert ln.ULabel.filter(name=\"test_merge_record\").count() == 1\n    ulabel = ln.ULabel.get(name=\"test_merge_record\")\n    assert ulabel.branch.name == \"main\"\n    # created_on still points to the branch on which the record was created\n    assert ulabel.created_on == branch\n    assert ulabel.created_on.name == \"test_merge_branch\"\n    # merged branch has status \"merged\"\n    branch.refresh_from_db()\n    assert branch.status == \"merged\"\n    # this is a merge call to check that branch.describe() works because it\n    # has a custom describe method\n    branch.describe(return_str=True)\n\n    # Clean up\n    ulabel.delete(permanent=True)\n    branch.delete(permanent=True)\n    ln.setup.switch(\"main\")\n\n\ndef test_branch_status_values():\n    \"\"\"Branch status maps codes onto standalone/draft/review/merged/closed.\"\"\"\n    main_branch = ln.Branch.get(name=\"main\")\n    assert main_branch.status == \"standalone\"\n    archive_branch = ln.Branch.get(name=\"archive\")\n    assert archive_branch.status == \"standalone\"\n    trash_branch = 
ln.Branch.get(name=\"trash\")\n    assert trash_branch.status == \"standalone\"\n    # User-created branch is standalone by default.\n    branch = ln.Branch(name=\"test_status_branch\").save()\n    assert branch.status == \"standalone\"\n    branch.status = \"draft\"\n    branch.save()\n    branch.refresh_from_db()\n    assert branch.status == \"draft\"\n    branch.status = \"review\"\n    branch.save()\n    branch.refresh_from_db()\n    assert branch.status == \"review\"\n    branch.status = \"closed\"\n    branch.save()\n    branch.refresh_from_db()\n    assert branch.status == \"closed\"\n    branch.delete(permanent=True)\n\n\ndef test_draft_review_and_close_merge_request_status():\n    branch = ln.Branch(name=\"test_mr_draft_review_close\").save()\n    assert branch.status == \"standalone\"\n\n    branch.status = \"draft\"\n    branch.save()\n    branch.refresh_from_db()\n    assert branch.status == \"draft\"\n\n    branch.status = \"review\"\n    branch.save()\n    branch.refresh_from_db()\n    assert branch.status == \"review\"\n\n    branch.status = \"closed\"\n    branch.save()\n    branch.refresh_from_db()\n    assert branch.status == \"closed\"\n\n    branch.delete(permanent=True)\n\n\ndef test_merge_nonexistent_branch_raises():\n    \"\"\"Merge a non-existent branch raises ObjectDoesNotExist.\"\"\"\n    with pytest.raises(ln.errors.ObjectDoesNotExist) as exc_info:\n        ln.setup.merge(\"nonexistent_branch_xyz\")\n    assert \"not found\" in str(exc_info.value).lower()\n\n\ndef test_merge_reconciles_is_latest_for_versioned_records():\n    main_branch = ln.Branch.get(name=\"main\")\n    ln.setup.switch(main_branch.name)\n\n    transform_v1 = ln.Transform(\n        key=\"test-merge-is-latest\",\n        source_code=\"main-v1\",\n        kind=\"pipeline\",\n    ).save()\n    branch = ln.Branch(name=\"test_merge_latest_branch\").save()\n    ln.setup.switch(branch.name)\n    transform_v2 = ln.Transform(\n        key=\"test-merge-is-latest\",\n        
revises=transform_v1,\n        source_code=\"feature-v2\",\n        kind=\"pipeline\",\n    ).save()\n    transform_v1.refresh_from_db()\n    assert transform_v1.is_latest\n    assert transform_v2.is_latest\n\n    ln.setup.switch(main_branch.name)\n    ln.setup.merge(branch.name)\n\n    family = ln.Transform.objects.filter(\n        uid__startswith=transform_v1.uid[:-4], branch_id=1\n    )\n    assert family.filter(is_latest=True).count() == 1\n    assert family.get(is_latest=True).uid == transform_v2.uid\n\n    for record in family:\n        record.delete(permanent=True)\n    branch.delete(permanent=True)\n\n\ndef test_merge_updates_recordblock_branch():\n    main_branch = ln.Branch.get(name=\"main\")\n    ln.setup.switch(main_branch.name)\n\n    source_branch = ln.Branch(name=\"test_merge_recordblock_branch\").save()\n    ln.setup.switch(source_branch.name)\n    record = ln.Record(name=\"recordblock-merge-record\").save()\n    block = ln.models.RecordBlock(\n        record=record,\n        content=\"recordblock merge content\",\n        kind=\"readme\",\n        branch=source_branch,\n        created_on=source_branch,\n    ).save()\n    assert block.branch == source_branch\n    assert block.created_on == source_branch\n\n    ln.setup.switch(main_branch.name)\n    ln.setup.merge(source_branch.name)\n\n    block.refresh_from_db()\n    assert block.branch.name == \"main\"\n    assert block.created_on == source_branch\n\n    record.delete(permanent=True)\n    source_branch.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_nbconvert.py",
    "content": "import os\n\n\ndef test_nbconvert():\n    exit_code = os.system(  # noqa: S605\n        \"jupyter nbconvert --to notebook --inplace --execute ./tests/core/notebooks/load_schema.ipynb\"\n    )\n    assert exit_code == 0\n"
  },
  {
    "path": "tests/core/test_notebooks.py",
    "content": "import os\nimport subprocess\nfrom pathlib import Path\n\nimport lamindb as ln\nimport nbproject_test\n\nnotebook_dir = Path(__file__).parent / \"notebooks/\"\nnotebook_dir_duplicate = Path(__file__).parent / \"notebooks/duplicate/\"\n\n\ndef test_all_notebooks():\n    nbproject_test.execute_notebooks(notebook_dir)\n    nbproject_test.execute_notebooks(notebook_dir_duplicate)\n\n\ndef test_run_after_rename_no_uid():\n    notebook_path = (\n        notebook_dir / \"with-title-initialized-consecutive-finish-not-last-cell.ipynb\"\n    )\n    result = subprocess.run(  # noqa: S602\n        f\"jupyter nbconvert --to notebook --inplace --execute {notebook_path}\",\n        shell=True,\n        capture_output=True,\n    )\n    print(result.stdout.decode())\n    print(result.stderr.decode())\n    assert result.returncode == 0\n\n    uid = ln.Transform.get(\n        key=\"with-title-initialized-consecutive-finish-not-last-cell.ipynb\"\n    ).uid\n\n    # now, assume the user renames the notebook\n    new_path = notebook_path.with_name(\"no-uid-renamed.ipynb\")\n    os.system(f\"cp {notebook_path} {new_path}\")  # noqa: S605\n\n    result = subprocess.run(  # noqa: S602\n        f\"jupyter nbconvert --to notebook --inplace --execute {new_path}\",\n        shell=True,\n        capture_output=True,\n    )\n    print(result.stdout.decode())\n    print(result.stderr.decode())\n    assert result.returncode == 0\n\n    assert ln.Transform.get(key=\"no-uid-renamed.ipynb\").uid == uid\n\n    # new_path.unlink()\n"
  },
  {
    "path": "tests/core/test_querydb.py",
    "content": "import lamindb as ln\nimport pytest\n\n\ndef test_DB_multiple_instances():\n    \"\"\"Accessing multiple instances simultaneously must work.\"\"\"\n    cxg_db = ln.DB(\"laminlabs/cellxgene\")\n    lamindata_db = ln.DB(\"laminlabs/lamindata\")\n    qs1 = cxg_db.Artifact.filter(suffix=\".h5ad\")\n    qs2 = lamindata_db.Artifact.filter(suffix=\".zarr\")\n    assert qs1._db != qs2._db\n\n\ndef test_DB_bionty():\n    \"\"\"Querying a record from bionty must work.\"\"\"\n    cxg_db = ln.DB(\"laminlabs/cellxgene\")\n    assert len(cxg_db.bionty.Gene.filter(symbol__startswith=\"TP53\")) > 0\n\n\ndef test_DB_missing_module():\n    \"\"\"Attempting to access an attribute that comes from a missing module must error.\"\"\"\n    site_assets_db = ln.DB(\"laminlabs/lamin-site-assets\")  # instance without bionty\n\n    with pytest.raises(AttributeError) as e:\n        site_assets_db.bionty.Gene.first()\n\n    assert (\n        \"Schema 'bionty' not available in instance 'laminlabs/lamin-site-assets'.\"\n        in str(e.value)\n    )\n\n\ndef test_DB_instantiate_class():\n    \"\"\"Attempting to instantiate a class must error.\"\"\"\n    cxg_db = ln.DB(\"laminlabs/cellxgene\")\n    with pytest.raises(TypeError) as e:\n        cxg_db.Artifact()\n    assert (\n        \"Cannot instantiate Artifact from DB. Use Artifact.filter(), Artifact.get(), etc. 
to query records.\"\n        in str(e.value)\n    )\n\n\n@pytest.mark.parametrize(\n    \"attr,expected_msg\",\n    [\n        (\"artifacts\", \"Registry 'artifacts' not found\"),\n        (\"foo\", \"Registry 'foo' not found\"),\n        (\"celltype\", \"Registry 'celltype' not found\"),\n    ],\n)\ndef test_DB_rejects_invalid_attributes(attr, expected_msg):\n    \"\"\"Accessing invalid attributes must fail.\"\"\"\n    cxg_db = ln.DB(\"laminlabs/cellxgene\")\n    with pytest.raises(AttributeError) as e:\n        getattr(cxg_db, attr)\n    assert expected_msg in str(e.value)\n\n\ndef test_DB_cache():\n    \"\"\"Subsequent accesses must return cached wrapper.\"\"\"\n    cxg_db = ln.DB(\"laminlabs/cellxgene\")\n    artifact1 = cxg_db.Artifact\n    artifact2 = cxg_db.Artifact\n    assert artifact1 is artifact2\n\n\ndef test_queryset_caching():\n    \"\"\"Calling `.filter()` multiple times should return different results.\"\"\"\n    cxg_db = ln.DB(\"laminlabs/cellxgene\")\n    res_1 = cxg_db.Artifact.filter().first()\n    res_2 = cxg_db.Artifact.filter().last()\n\n    assert res_1 != res_2\n\n\ndef test_DB_dir():\n    \"\"\"__dir__ must return discovered registries.\"\"\"\n    cxg = ln.DB(\"laminlabs/cellxgene\")\n    dir_result = dir(cxg)\n    assert \"Artifact\" in dir_result\n    assert \"Collection\" in dir_result\n    assert \"Gene\" not in dir_result\n    assert \"bionty\" in dir_result\n"
  },
  {
    "path": "tests/core/test_queryset.py",
    "content": "import re\nimport textwrap\nfrom contextlib import contextmanager\n\nimport bionty as bt\nimport lamindb as ln\nimport pytest\nfrom django.core.exceptions import FieldError\nfrom lamindb.base.users import current_user_id\nfrom lamindb.errors import InvalidArgument\nfrom lamindb.models import ArtifactSet, BasicQuerySet, QuerySet\n\n\n# please also see the test_curate_df.py tests\ndef test_to_dataframe():\n    project_label = ln.Record(name=\"project\").save()\n    project_names = [f\"Project {i}\" for i in range(3)]\n    labels = ln.Record.from_values(project_names, create=True).save()\n    project_label.children.add(*labels)\n    df = ln.Record.to_dataframe(include=\"parents__name\")\n    assert df.columns[2] == \"parents__name\"\n    assert df[\"parents__name\"].iloc[0] == {project_label.name}\n    df = ln.Record.to_dataframe(include=[\"parents__name\", \"parents__created_by_id\"])\n    assert df.columns[3] == \"parents__created_by_id\"\n    assert df[\"parents__name\"].iloc[0] == {project_label.name}\n    assert set(df[\"parents__created_by_id\"].iloc[0]) == {current_user_id()}\n\n    # for other models\n    feature_names = [f\"Feature {i}\" for i in range(3)]\n    features = [ln.Feature(name=name, dtype=int) for name in feature_names]\n    ln.save(features)\n    schema = ln.Schema(features, name=\"my schema\").save()\n    schema.features.set(features)\n\n    df = ln.Schema.filter(name=\"my schema\").to_dataframe(include=\"features__name\")\n    assert df.columns[2] == \"features__name\"\n    # order is not conserved\n    assert set(df[\"features__name\"].iloc[0]) == set(feature_names)\n    # pass a list\n    df = ln.Schema.filter(name=\"my schema\").to_dataframe(\n        include=[\"features__name\", \"features__created_by_id\"]\n    )\n    assert df.columns[3] == \"features__created_by_id\"\n    assert set(df[\"features__name\"].iloc[0]) == set(feature_names)\n    assert set(df[\"features__created_by_id\"].iloc[0]) == {current_user_id()}\n\n    
# inner join parents on features\n    df = ln.Schema.filter().to_dataframe(\n        include=[\"features__name\", \"features__created_by_id\"]\n    )\n    assert set(df[\"features__name\"].iloc[0]) == set(feature_names)\n    assert set(df[\"features__created_by_id\"].iloc[0]) == {current_user_id()}\n\n    # a non many-to-many relation (created_by) can be included too and yields a plain value, not a set\n    df = ln.Record.filter(name=\"Project 0\").to_dataframe(include=\"created_by__name\")\n    assert df[\"created_by__name\"].iloc[0] == ln.setup.settings.user.name\n\n    # do not return fields with no data in the registry\n    # does not make sense in Alex's opinion\n    # too much magic; got removed in https://github.com/laminlabs/lamindb/pull/2238\n    # df = (\n    #     ln.Artifact.connect(\"laminlabs/cellxgene\")\n    #     .filter(suffix=\".h5ad\")\n    #     .to_dataframe(include=[\"tissues__name\", \"pathways__name\"])\n    # )\n    # assert \"tissues__name\" in df.columns\n    # assert \"pathways__name\" not in df.columns\n    # assert df.shape[0] > 0\n\n    # clean up\n    project_label.delete(permanent=True)\n    for label in labels:\n        label.delete(permanent=True)\n\n    schema.delete(permanent=True)\n    for feature in features:\n        feature.delete(permanent=True)\n\n    # call it from a non-select-derived queryset\n    qs = ln.User.objects.all()\n    assert qs.to_dataframe().iloc[0][\"handle\"] == ln.setup.settings.user.handle\n\n\ndef test_complex_df_with_features():\n    # should not fail\n    ln.Artifact.connect(\"laminlabs/lamindata\").to_dataframe(include=\"features\")\n    ln.Run.connect(\"laminlabs/lamindata\").to_dataframe(include=\"features\")\n    ln.Artifact.connect(\"laminlabs/lamindata\").to_dataframe(features=\"queryset\")\n\n\ndef test_run_to_dataframe_includes_json_features():\n    transform = ln.Transform(key=\"test_run_to_dataframe_includes_json_features\").save()\n    run = ln.Run(transform=transform).save()\n    feature = ln.Feature(name=\"run_json_feature\", dtype=str).save()\n\n    
run.features.set_values({\"run_json_feature\": \"hello\"})\n    df = ln.Run.filter(id=run.id).to_dataframe(include=\"features\")\n\n    assert \"run_json_feature\" in df.columns\n    assert df[\"run_json_feature\"].iloc[0] == \"hello\"\n\n    run.delete(permanent=True)\n    transform.delete(permanent=True)\n    feature.delete(permanent=True)\n\n\ndef test_one_first():\n    qs = ln.User.objects.all()\n    assert qs.one().handle == ln.setup.settings.user.handle\n    assert qs.first().handle == ln.setup.settings.user.handle\n    assert qs.one_or_none().handle == ln.setup.settings.user.handle\n\n    description = textwrap.dedent(\"\"\"\\\n    User\n      Simple fields\n    \"\"\").strip()\n    assert qs.describe(return_str=True).startswith(description)\n\n    qs = ln.User.filter(handle=\"test\")\n    with pytest.raises(ln.errors.ObjectDoesNotExist):\n        qs.one()\n    qs = bt.Source.filter()\n    with pytest.raises(ln.errors.MultipleObjectsReturned):\n        qs.one()\n    with pytest.raises(ln.errors.MultipleObjectsReturned):\n        qs.one_or_none()\n\n\ndef test_filter_related_field_name():\n    with pytest.raises(\n        FieldError,\n        match=re.escape(\n            \"Invalid lookup 'somelabel' for records. 
Did you mean records__name?\"\n        ),\n    ):\n        ln.Artifact.filter(records=\"somelabel\")\n\n\ndef test_filter_unknown_field():\n    with pytest.raises(InvalidArgument) as e:\n        ln.Artifact.filter(nonexistent=\"value\")\n    assert \"You can query either by available fields\" in str(e)\n\n\ndef test_filter_status_field():\n    transform = ln.Transform(key=\"test_filter_status_field\").save()\n    run = ln.Run(transform).save()\n    run._status_code = 0\n    run.save(update_fields=[\"_status_code\"])\n    assert ln.Run.filter(status=\"completed\").count() >= 1\n\n    branch = ln.Branch(name=\"test_filter_status_branch\").save()\n    branch.status = \"review\"\n    branch.save()\n    assert ln.Branch.filter(status=\"review\").count() >= 1\n\n    project = ln.Project(name=\"test_filter_status_project\").save()\n    project._status_code = 2\n    project.save(update_fields=[\"_status_code\"])\n    assert ln.Project.filter(status=2).count() >= 1\n\n    run.delete(permanent=True)\n    transform.delete(permanent=True)\n    project.delete(permanent=True)\n    branch.delete()\n\n\ndef test_get_id_type_error():\n    with pytest.raises(\n        ValueError, match=re.escape(\"Field 'id' expected a number but got 'abc'.\")\n    ):\n        ln.Artifact.get(id=\"abc\")\n\n\ndef test_get_related_field_name():\n    with pytest.raises(\n        FieldError,\n        match=re.escape(\n            \"Invalid lookup 'somelabel' for records. Did you mean records__name?\"\n        ),\n    ):\n        ln.Artifact.get(records=\"somelabel\")\n\n\ndef test_get_unknown_field():\n    with pytest.raises(FieldError) as e:\n        ln.Artifact.get(nonexistent=\"value\")\n    assert \"Unknown field 'nonexistent'. 
Available fields:\" in str(e)\n\n\ndef test_search():\n    label_names = [f\"Record {i}\" for i in range(3)]\n    labels = [ln.Record(name=name) for name in label_names]\n    ln.save(labels)\n    qs = ln.Record.filter(name__startswith=\"Record\")\n    assert qs.search(\"Record 1\")[0].name == \"Record 1\"\n    assert qs.search(\"Record 1\", field=ln.Record.name)[0].name == \"Record 1\"\n    for label in labels:\n        label.delete(permanent=True)\n\n\ndef test_lookup():\n    qs = ln.User.filter(handle=\"testuser1\")\n    # pass str to field\n    lookup = qs.lookup(field=\"handle\")\n    assert lookup.testuser1.handle == \"testuser1\"\n    # pass StrField to field\n    lookup = qs.lookup(field=ln.User.handle)\n    assert lookup.testuser1.handle == \"testuser1\"\n    # manager, default field\n    qsm = ln.User.filter(handle=\"testuser1\")\n    lookup = qsm.lookup()\n    assert lookup.testuser1.handle == \"testuser1\"\n\n\ndef test_inspect():\n    qs = ln.User.filter(handle=\"testuser1\")\n    assert qs.inspect([\"user1\", \"user2\"], \"name\")[\"validated\"] == []\n    assert ln.User.inspect([\"user1\", \"user2\"], \"name\")[\"validated\"] == []\n    assert ln.User.inspect([\"user1\", \"user2\"], ln.User.name)[\"validated\"] == []\n    assert ln.User.inspect(\"user1\", \"name\")[\"validated\"] == []\n\n\ndef test_validate():\n    qs = ln.User.filter(handle=\"testuser1\")\n    assert qs.validate([\"testuser1\", \"Test User1\"], \"handle\").tolist() == [True, False]\n    assert ln.User.validate([\"testuser1\", \"Test User1\"], \"handle\").tolist() == [\n        True,\n        False,\n    ]\n    assert ln.User.validate([\"testuser1\", \"Test User1\"], ln.User.handle).tolist() == [\n        True,\n        False,\n    ]\n    # returns True\n    assert ln.User.validate(\"testuser1\", ln.User.handle)\n\n\ndef test_standardize():\n    qs = ln.User.filter(handle=\"testuser1\")\n    assert qs.standardize([\"user1\", \"user2\"]) == [\"user1\", \"user2\"]\n\n\ndef 
test_get_doesnotexist_error():\n    non_existent_label = \"some-label-name\"\n\n    with pytest.raises(ln.errors.ObjectDoesNotExist) as excinfo:\n        ln.Record.get(non_existent_label)\n\n    error_message = str(excinfo.value)\n    assert f\"No record found with uid '{non_existent_label}'\" in error_message\n    assert (\n        f\"Did you forget a keyword as in Record.get(name='{non_existent_label}')?\"\n        in error_message\n    )\n\n\n@contextmanager\ndef set_branch(branch: ln.Branch):\n    try:\n        ln.setup.settings.branch = branch\n        yield branch\n    finally:\n        ln.setup.settings._branch = None\n        ln.setup.settings._branch_path.unlink(missing_ok=True)\n\n\ndef test_get_filter_branch():\n    branch = ln.Branch(name=\"test_branch\").save()\n\n    artifact = ln.Artifact.from_dataframe(\n        ln.User.to_dataframe(), key=\"df_test_get.parquet\"\n    )\n    artifact.branch = branch\n    artifact.save()\n\n    # switch to branch \"test_branch\"\n    with set_branch(branch):\n        # errors if doesn't find or multiple records found\n        ln.Artifact.get(key=\"df_test_get.parquet\")\n        assert ln.Artifact.filter(key=\"df_test_get.parquet\").count() == 1\n\n    # back to main branch\n    with pytest.raises(ln.errors.ObjectDoesNotExist):\n        ln.Artifact.get(key=\"df_test_get.parquet\")\n    assert ln.Artifact.filter(key=\"df_test_get.parquet\").count() == 0\n    # test by passing branch directly\n    assert (\n        ln.Artifact.filter(\n            branch=branch,\n            key=\"df_test_get.parquet\",\n        ).count()\n        == 1\n    )\n    assert (\n        ln.Artifact.filter(branch_id=branch.id, key=\"df_test_get.parquet\").count() == 1\n    )\n    assert (\n        ln.Artifact.filter(ln.Q(branch=branch), key=\"df_test_get.parquet\").count() == 1\n    )\n    assert (\n        ln.Artifact.filter(ln.Q(branch_id=branch.id), key=\"df_test_get.parquet\").count()\n        == 1\n    )\n\n    # errors if doesn't find 
or multiple records found\n    ln.Artifact.get(key=\"df_test_get.parquet\", branch=branch)\n    ln.Artifact.get(key=\"df_test_get.parquet\", branch_id=branch.id)\n    ln.Artifact.get(key=\"df_test_get.parquet\", branch__in=[branch])\n    ln.Artifact.get(key=\"df_test_get.parquet\", branch_id__in=[branch.id])\n    ln.Artifact.get(key=\"df_test_get.parquet\", branch=None)\n    ln.Artifact.get(key=\"df_test_get.parquet\", branch_id=None)\n\n    ln.Artifact.get(artifact.id)\n    ln.Artifact.get(id=artifact.id)\n    ln.Artifact.get(id__in=[artifact.id])\n\n    ln.Artifact.get(artifact.uid[:5])\n    ln.Artifact.get(uid=artifact.uid)\n    ln.Artifact.get(uid__in=[artifact.uid])\n\n    ln.Artifact.get(hash=artifact.hash)\n    ln.Artifact.get(hash__in=[artifact.hash])\n\n    artifact.delete(permanent=True)\n    branch.delete()\n\n\ndef test_to_class():\n    qs = ln.Artifact.filter()\n    assert isinstance(qs, QuerySet)\n    assert isinstance(qs, ArtifactSet)\n\n    qs_copy = qs._to_non_basic(copy=True)\n    assert isinstance(qs_copy, QuerySet)\n    assert isinstance(qs_copy, ArtifactSet)\n\n    qs_basic = qs._to_basic(copy=True)\n    assert isinstance(qs_basic, BasicQuerySet)\n    assert isinstance(qs_basic, ArtifactSet)\n    assert not isinstance(qs_basic, QuerySet)\n\n    qs_basic._to_non_basic(copy=False)\n    assert isinstance(qs_basic, QuerySet)\n    assert isinstance(qs_basic, ArtifactSet)\n\n\ndef test_queryset_soft_delete_error():\n    with pytest.raises(ValueError):\n        ln.Storage.filter().delete(permanent=False)\n\n    with pytest.raises(ValueError):\n        ln.Branch.filter().delete(permanent=False)\n\n\ndef test_encode_lamindb_fields_as_columns():\n    from lamindb.models.query_set import encode_lamindb_fields_as_columns\n\n    assert encode_lamindb_fields_as_columns(\n        ln.Artifact, [\"uid\", \"name\", \"created_by\", \"key\", \"tissues\"]\n    ) == {\n        \"uid\": \"__lamindb_artifact_uid__\",\n        \"created_by\": 
\"__lamindb_artifact_created_by__\",\n        \"key\": \"__lamindb_artifact_key__\",\n    }\n    assert encode_lamindb_fields_as_columns(\n        ln.Record, [\"uid\", \"name\", \"created_by\", \"key\", \"tissues\"]\n    ) == {\n        \"uid\": \"__lamindb_record_uid__\",\n        \"name\": \"__lamindb_record_name__\",\n        \"created_by\": \"__lamindb_record_created_by__\",\n    }\n\n\n# def test_connect_public_clone_instance():\n#     # become an anonymous user\n#     ln_setup.logout()\n\n#     try:\n#         from django.db import connections\n\n#         connections.databases.pop(\"laminlabs/arc-virtual-cell-atlas\", None)\n\n#         qs = ln.Artifact.connect(\"laminlabs/arc-virtual-cell-atlas\")\n\n#         assert qs.db == \"laminlabs/arc-virtual-cell-atlas\"\n\n#         # Verify the connection is SQLite, not Postgres\n#         assert (\n#             \"sqlite\"\n#             in connections.databases[\"laminlabs/arc-virtual-cell-atlas\"][\"ENGINE\"]\n#         )\n\n#         # Verify we can actually query it\n#         result = qs.filter().first()\n#         assert result is not None\n#     finally:\n#         # log back in to ensure that other tests do not break\n#         login_testuser2(session=None)\n#         login_testuser1(session=None)\n#         ln_setup.connect(\"lamindb-unit-tests-core\")\n"
  },
  {
    "path": "tests/core/test_record_basics.py",
    "content": "import os\nimport re\nfrom datetime import date, datetime\n\nimport bionty as bt\nimport lamindb as ln\nimport pandas as pd\nimport pytest\nfrom django.db import IntegrityError\nfrom lamindb.errors import FieldValidationError\nfrom lamindb.models.record import IMPORTS_UID, SCHEMA_IMPORTS_UID\n\n\ndef test_record_docstring_examples():\n    # create a feature if you don't yet have one\n    gc_content = ln.Feature(name=\"gc_content\", dtype=float).save()\n\n    # create a record to track a sample\n    sample1 = ln.Record(name=\"Sample 1\", features={\"gc_content\": 0.5}).save()\n\n    # describe the record\n    sample1.describe()\n\n    # create a flexible record type to track experiments\n    experiment_type = ln.Record(name=\"Experiment\", is_type=True).save()\n    experiment1 = ln.Record(name=\"Experiment 1\", type=experiment_type).save()\n\n    # create a feature to link experiments\n    experiment = ln.Feature(name=\"experiment\", dtype=experiment_type).save()\n\n    # create a record type to track samples that's constrained with a schema\n    schema = ln.Schema(\n        [experiment, gc_content.with_config(optional=True)], name=\"sample_schema\"\n    ).save()\n    sample_sheet = ln.Record(name=\"Sample Sheet\", is_type=True, schema=schema).save()\n\n    # group the sample1 record under the sample sheet\n    sample1.type = sample_sheet\n    sample1.save()\n\n    # reset the feature values for the record including the experiment\n    sample1.features.set_values(\n        {\n            \"gc_content\": 0.5,\n            \"experiment\": \"Experiment 1\",  # automatically resolves by name, also accepts the experiment1 object\n        }\n    )\n\n    # Export all records under a type to a dataframe\n    df = experiment_type.to_dataframe()\n    assert \"Experiment 1\" in df[\"__lamindb_record_name__\"].values\n\n    # If you try to set incomplete features in a record in a sheet, you'll get a validation error\n    sample2 = ln.Record(name=\"Sample 2\", 
type=sample_sheet).save()\n    with pytest.raises(ln.errors.ValidationError):\n        sample2.features.set_values({\"gc_content\": 0.6})\n\n    # Query records by features\n    assert ln.Record.filter(gc_content=0.5).one() == sample1\n    assert ln.Record.filter(gc_content__gt=0.4).one() == sample1\n    assert ln.Record.filter(type=sample_sheet).count() >= 1\n\n    # Clean up\n    sample1.delete(permanent=True)\n    sample2.delete(permanent=True)\n    experiment1.delete(permanent=True)\n    sample_sheet.delete(permanent=True)\n    schema.delete(permanent=True)\n    experiment_type.delete(permanent=True)\n    gc_content.delete(permanent=True)\n    experiment.delete(permanent=True)\n\n\ndef test_record_initialization():\n    with pytest.raises(\n        FieldValidationError,\n        match=re.escape(\n            \"Only name, type, is_type, features, description, schema, reference, reference_type are valid keyword arguments\"\n        ),\n    ):\n        ln.Record(x=1)\n\n    with pytest.raises(ValueError) as error:\n        ln.Record(1)\n    assert error.exconly() == \"ValueError: Only one non-keyword arg allowed\"\n\n\ndef test_record_lazy_features_on_save():\n    score_feature = ln.Feature(name=\"lazy_score\", dtype=float).save()\n    record = ln.Record(name=\"lazy-record\", features={\"lazy_score\": 0.7}).save()\n\n    assert not hasattr(record, \"_features\")\n    assert ln.Record.filter(lazy_score=0.7).one().id == record.id\n\n    record.delete(permanent=True)\n    score_feature.delete(permanent=True)\n\n\ndef test_record_from_dataframe_bulk_save_paths():\n    score = ln.Feature(name=\"from-df-score\", dtype=float).save()\n    schema = ln.Schema([score], name=\"from-df-schema\").save()\n    sheet = ln.Record(name=\"from-df-sheet\", is_type=True, schema=schema).save()\n    df = pd.DataFrame(\n        {\n            \"__lamindb_record_name__\": [\"from-df-a\", \"from-df-b\"],\n            \"from-df-score\": [1.0, 2.0],\n        }\n    )\n\n    records = 
ln.Record.from_dataframe(df, type=sheet)\n    assert len(records) == 2\n    records.save()\n    assert ln.Record.get(name=\"from-df-a\").features.get_values()[\"from-df-score\"] == 1.0\n\n    df2 = pd.DataFrame(\n        {\n            \"__lamindb_record_name__\": [\"from-df-c\"],\n            \"from-df-score\": [3.0],\n        }\n    )\n    records_2 = ln.Record.from_dataframe(df2, type=sheet)\n    records_2.save()\n    assert ln.Record.get(name=\"from-df-c\").features.get_values()[\"from-df-score\"] == 3.0\n\n    ln.Record.filter(name__in=[\"from-df-a\", \"from-df-b\", \"from-df-c\"]).delete(\n        permanent=True\n    )\n    ln.Record.filter(name=\"from-df-sheet\").delete(permanent=True)\n    schema.delete(permanent=True)\n    score.delete(permanent=True)\n\n\ndef test_record_from_dataframe_requires_named_type():\n    df = pd.DataFrame({\"__lamindb_record_name__\": [\"x\"], \"score\": [1.0]})\n    non_type_record = ln.Record(name=\"from-df-non-type\").save()\n    unnamed_type = ln.Record(name=\"from-df-temp-type\", is_type=True)\n    unnamed_type.name = None\n\n    with pytest.raises(ValueError, match=\"is_type=True\"):\n        ln.Record.from_dataframe(df, type=non_type_record)\n    with pytest.raises(ValueError, match=\"non-null `name`\"):\n        ln.Record.from_dataframe(df, type=unnamed_type)\n\n    non_type_record.delete(permanent=True)\n\n\ndef test_record_from_dataframe_with_string_type_creates_import_type():\n    score = ln.Feature(name=\"from-df-str-score\", dtype=float).save()\n    df = pd.DataFrame(\n        {\n            \"__lamindb_record_name__\": [\"from-df-str-a\", \"from-df-str-b\"],\n            \"from-df-str-score\": [11.0, 12.0],\n        }\n    )\n    imports_type = ln.Record.filter(uid=IMPORTS_UID).one_or_none()\n    original_imports_name = None\n    if imports_type is not None:\n        original_imports_name = imports_type.name\n        imports_type.name = \"from-df-renamed-imports-parent\"\n        imports_type.save()\n\n    try:\n    
    records = ln.Record.from_dataframe(df, type=\"from-df-str-type\")\n        created_type = ln.Record.get(name=\"from-df-str-type\", is_type=True)\n        imports_type = ln.Record.get(uid=IMPORTS_UID)\n\n        assert len(records) == 2\n        assert records.type.id == created_type.id\n        assert created_type.type_id == imports_type.id\n        assert created_type.schema.type is not None\n        assert created_type.schema.type.uid == SCHEMA_IMPORTS_UID\n        assert created_type.schema_id is not None\n\n        records.save()\n        assert (\n            ln.Record.get(name=\"from-df-str-a\").features.get_values()[\n                \"from-df-str-score\"\n            ]\n            == 11.0\n        )\n    finally:\n        created_type = ln.Record.filter(\n            name=\"from-df-str-type\", is_type=True\n        ).one_or_none()\n        ln.Record.filter(name__in=[\"from-df-str-a\", \"from-df-str-b\"]).delete(\n            permanent=True\n        )\n        ln.Record.filter(name=\"from-df-str-type\").delete(permanent=True)\n        if created_type is not None and created_type.schema_id is not None:\n            ln.Schema.filter(id=created_type.schema_id).delete(permanent=True)\n        if original_imports_name is not None:\n            imports_type = ln.Record.get(uid=IMPORTS_UID)\n            imports_type.name = original_imports_name\n            imports_type.save()\n        score.delete(permanent=True)\n\n\ndef test_record_from_dataframe_with_string_type_duplicate_name_errors():\n    score = ln.Feature(name=\"from-df-dup-score\", dtype=float).save()\n    schema = ln.Schema([score], name=\"from-df-dup-schema\").save()\n    imports_type = ln.Record.filter(uid=IMPORTS_UID).one_or_none()\n    if imports_type is None:\n        imports_type = ln.Record(name=\"Imports\", is_type=True)\n        imports_type.uid = IMPORTS_UID\n        imports_type = imports_type.save()\n    ln.Record(\n        name=\"from-df-dup-type\", is_type=True, schema=schema, 
type=imports_type\n    ).save()\n    df = pd.DataFrame(\n        {\n            \"__lamindb_record_name__\": [\"from-df-dup-a\"],\n            \"from-df-dup-score\": [21.0],\n        }\n    )\n\n    with pytest.raises(ValueError, match=\"already exists\"):\n        ln.Record.from_dataframe(df, type=\"from-df-dup-type\")\n\n    ln.Record.filter(name=\"from-df-dup-type\").delete(permanent=True)\n    schema.delete(permanent=True)\n    score.delete(permanent=True)\n\n\ndef test_feature_manager_raise_not_validated_values():\n    from lamindb.models._feature_manager import FeatureManager\n\n    assert FeatureManager._raise_not_validated_values({}) is None\n\n    with pytest.raises(ln.errors.ValidationError) as error:\n        FeatureManager._raise_not_validated_values(\n            {\n                \"Record\": (\"name\", [\"missing-record\"]),\n                \"bionty.Gene\": (\"symbol\", [\"missing-gene\"]),\n            }\n        )\n    message = str(error.value)\n    assert \"These values could not be validated\" in message\n    assert (\n        \"records = ln.Record.from_values(['missing-record'], field='name', create=True).save()\"\n        in message\n    )\n    assert (\n        \"records = bionty.Gene.from_values(['missing-gene'], field='symbol').save()\"\n        in message\n    )\n\n\ndef test_name_lookup():\n    my_type = ln.Record(name=\"MyType\", is_type=True).save()\n    label1 = ln.Record(name=\"label 1\", type=my_type).save()\n    label2 = ln.Record(name=\"label 1\", type=my_type)\n    assert label2 == label1\n    label2 = ln.Record(name=\"label 1\")\n    assert label2 != label1\n    label2.save()\n    label3 = ln.Record(name=\"label 1\")\n    assert label3 == label2\n    label2.delete(permanent=True)\n    label1.delete(permanent=True)\n    my_type.delete(permanent=True)\n\n\n@pytest.mark.skipif(\n    os.getenv(\"LAMINDB_TEST_DB_VENDOR\") == \"sqlite\", reason=\"Postgres-only\"\n)\ndef test_invalid_type_record_with_schema():\n    schema = 
ln.Schema(name=\"test_schema\", itype=ln.Feature).save()\n\n    record_type_with_schema = ln.Record(\n        name=\"TypeWithSchema\", is_type=True, schema=schema\n    ).save()\n\n    with pytest.raises(IntegrityError) as error:\n        ln.Record(name=\"InvalidType\", is_type=True, type=record_type_with_schema).save()\n    assert \"record_type_is_valid_fk\" in error.exconly()\n\n    record_type_with_schema.delete(permanent=True)\n    schema.delete(permanent=True)\n\n\n# see test_artifact_features_add_remove_query in test_artifact_external_features_annotations.py for similar test for Artifacts (populate and query by features)\ndef test_record_features_add_remove_values():\n    record_type1 = ln.Record(name=\"RecordType1\", is_type=True).save()\n    record_entity1 = ln.Record(name=\"entity1\", type=record_type1).save()\n    record_entity2 = ln.Record(name=\"entity2\", type=record_type1).save()\n    ulabel = ln.ULabel(name=\"test-ulabel\").save()\n    artifact = ln.Artifact(\".gitignore\", key=\"test-artifact\").save()\n    collection = ln.Collection(artifact, key=\"test-collection\").save()\n    transform = ln.Transform(key=\"test-transform\").save()\n    run = ln.Run(transform, name=\"test-run\").save()\n\n    feature_bool = ln.Feature(name=\"feature_bool\", dtype=bool).save()\n    feature_str = ln.Feature(name=\"feature_str\", dtype=str).save()\n    feature_list_str = ln.Feature(name=\"feature_list_str\", dtype=list[str]).save()\n    feature_int = ln.Feature(name=\"feature_int\", dtype=int).save()\n    feature_list_int = ln.Feature(name=\"feature_list_int\", dtype=list[int]).save()\n    feature_float = ln.Feature(name=\"feature_float\", dtype=float).save()\n    feature_list_float = ln.Feature(name=\"feature_list_float\", dtype=list[float]).save()\n    feature_num = ln.Feature(name=\"feature_num\", dtype=\"num\").save()\n    feature_url = ln.Feature(name=\"feature_url\", dtype=\"url\").save()\n    feature_list_num = ln.Feature(name=\"feature_list_num\", 
dtype=\"list[num]\").save()\n    feature_datetime = ln.Feature(name=\"feature_datetime\", dtype=datetime).save()\n    feature_date = ln.Feature(name=\"feature_date\", dtype=datetime.date).save()\n    feature_dict = ln.Feature(name=\"feature_dict\", dtype=dict).save()\n    feature_type1 = ln.Feature(name=\"feature_type1\", dtype=record_type1).save()\n    feature_type1s = ln.Feature(name=\"feature_type1s\", dtype=list[record_type1]).save()\n    feature_user = ln.Feature(name=\"feature_user\", dtype=ln.User).save()\n    feature_ulabel = ln.Feature(name=\"feature_ulabel\", dtype=ln.ULabel).save()\n    feature_project = ln.Feature(name=\"feature_project\", dtype=ln.Project).save()\n    feature_artifact = ln.Feature(name=\"feature_artifact\", dtype=ln.Artifact).save()\n    feature_collection = ln.Feature(\n        name=\"feature_collection\", dtype=ln.Collection\n    ).save()\n    feature_run = ln.Feature(name=\"feature_run\", dtype=ln.Run.uid).save()\n    feature_cell_line = ln.Feature(name=\"feature_cell_line\", dtype=bt.CellLine).save()\n    feature_cell_lines = ln.Feature(\n        name=\"feature_cell_lines\", dtype=list[bt.CellLine]\n    ).save()\n    feature_cl_ontology_id = ln.Feature(\n        name=\"feature_cl_ontology_id\", dtype=bt.CellLine.ontology_id\n    ).save()\n    feature_gene = ln.Feature(name=\"feature_gene\", dtype=bt.Gene).save()\n\n    test_record = ln.Record(name=\"test_record\").save()\n    test_project = ln.Project(name=\"test_project\").save()\n    hek293 = bt.CellLine.from_source(name=\"HEK293\").save()\n    a549 = bt.CellLine.from_source(name=\"A-549\").save()\n    tmem276 = bt.Gene.from_source(symbol=\"Tmem276\", organism=\"mouse\").save()\n\n    # test feature.dtype_as_object\n    assert feature_bool.dtype_as_object is bool\n    assert feature_str.dtype_as_object is str\n    assert feature_list_str.dtype_as_object == list[str]\n    assert feature_int.dtype_as_object is int\n    assert feature_list_int.dtype_as_object == list[int]\n    
assert feature_float.dtype_as_object is float\n    assert feature_list_float.dtype_as_object == list[float]\n    assert feature_num.dtype_as_object is float\n    assert feature_url.dtype_as_object is str\n    assert feature_list_num.dtype_as_object == list[float]\n    assert feature_datetime.dtype_as_object == datetime\n    assert feature_date.dtype_as_object == date\n    assert feature_dict.dtype_as_object is dict\n    assert feature_type1.dtype_as_object == record_type1\n    assert feature_type1s.dtype_as_object == list[record_type1]\n    assert feature_user.dtype_as_object == ln.User.handle\n    assert feature_ulabel.dtype_as_object == ln.ULabel.name\n    assert feature_project.dtype_as_object == ln.Project.name\n    assert feature_artifact.dtype_as_object == ln.Artifact.key\n    assert feature_collection.dtype_as_object == ln.Collection.key\n    assert feature_run.dtype_as_object == ln.Run.uid\n    assert feature_cell_line.dtype_as_object == bt.CellLine.name\n    assert feature_cell_lines.dtype_as_object == list[bt.CellLine.name]\n    assert feature_cl_ontology_id.dtype_as_object == bt.CellLine.ontology_id\n    assert feature_gene.dtype_as_object == bt.Gene.symbol\n\n    # no schema validation\n    test_values = {\n        \"feature_bool\": True,\n        \"feature_str\": \"00810702-0006\",  # this string value could be cast to datetime! 
don't change!\n        \"feature_list_str\": [\"a\", \"list\", \"of\", \"strings\"],\n        \"feature_int\": 42,\n        \"feature_list_int\": [1, 2, 3],\n        \"feature_num\": 3.14,\n        \"feature_url\": \"https://lamin.ai/docs\",\n        \"feature_list_num\": [2.71, 3.14, 1.61],\n        \"feature_float\": 3.14,\n        \"feature_list_float\": [2.71, 3.14, 1.61],\n        \"feature_datetime\": datetime(2024, 1, 1, 12, 0, 0),\n        \"feature_date\": date(2024, 1, 1),\n        \"feature_dict\": {\"key\": \"value\", \"number\": 123, \"list\": [1, 2, 3]},\n        \"feature_type1\": \"entity1\",\n        \"feature_type1s\": [\"entity1\", \"entity2\"],\n        \"feature_ulabel\": \"test-ulabel\",\n        \"feature_user\": ln.setup.settings.user.handle,\n        \"feature_project\": \"test_project\",\n        \"feature_cell_line\": \"HEK293\",\n        \"feature_cell_lines\": [\"HEK293\", \"A-549\"],\n        \"feature_gene\": \"Tmem276\",\n        \"feature_cl_ontology_id\": \"CVCL_0045\",\n        \"feature_artifact\": \"test-artifact\",\n        \"feature_collection\": \"test-collection\",\n        \"feature_run\": run.uid,\n    }\n\n    test_record.features.add_values(test_values)\n    assert test_record.features.get_values() == test_values\n\n    # --- Query by features (same data as above) ---\n    # Equality\n    assert ln.Record.filter(feature_str=test_values[\"feature_str\"]).one() == test_record\n    assert ln.Record.filter(feature_int=42).one() == test_record\n    assert ln.Record.filter(feature_type1=\"entity1\").one() == test_record\n    assert ln.Record.filter(feature_cell_line=\"HEK293\").one() == test_record\n    assert ln.Record.filter(feature_url=\"https://lamin.ai/docs\").one() == test_record\n    assert (\n        ln.Record.filter(feature_str=test_values[\"feature_str\"], feature_int=42).one()\n        == test_record\n    )\n    # Datetime and date (filter uses ISO strings as stored in JSON)\n    assert 
ln.Record.filter(feature_datetime=\"2024-01-01T12:00:00\").one() == test_record\n    assert ln.Record.filter(feature_date=\"2024-01-01\").one() == test_record\n    # __contains (categorical)\n    assert ln.Record.filter(feature_cell_line__contains=\"HEK\").one() == test_record\n    assert ln.Record.filter(feature_type1__contains=\"entity\").one() == test_record\n    # Invalid field\n    with pytest.raises(ln.errors.InvalidArgument) as error:\n        ln.Record.filter(feature_str_typo=\"x\", feature_int=42).one()\n    assert error.exconly().startswith(\n        \"lamindb.errors.InvalidArgument: You can query either by available fields:\"\n    )\n    # DoesNotExist (no Record named \"nonexistent_entity\" exists)\n    with pytest.raises(ln.errors.ObjectDoesNotExist) as error:\n        ln.Record.filter(feature_type1=\"nonexistent_entity\").one()\n    assert \"Did not find\" in error.exconly()\n\n    # Combined filter (3 keys)\n    assert (\n        ln.Record.filter(\n            feature_str=test_values[\"feature_str\"],\n            feature_int=42,\n            feature_type1=\"entity1\",\n        ).one()\n        == test_record\n    )\n    # Bionty: filter by record\n    assert ln.Record.filter(feature_cell_line=hek293).one() == test_record\n    # Bionty: filter by ontology_id string\n    assert ln.Record.filter(feature_cl_ontology_id=\"CVCL_0045\").one() == test_record\n    # Bionty __contains (ontology_id)\n    assert (\n        ln.Record.filter(feature_cl_ontology_id__contains=\"0045\").one() == test_record\n    )\n    # DoesNotExist (Record not found: feature_project)\n    with pytest.raises(ln.errors.ObjectDoesNotExist) as error:\n        ln.Record.filter(feature_project=\"nonexistent_project\").one()\n    assert \"Did not find\" in error.exconly()\n    # __contains returns multiple (add second record, assert, then remove)\n    value_record = ln.Record(name=\"query_test_value_record\").save()\n    value_record.features.add_values({\"feature_type1\": 
\"entity2\"})\n    assert len(ln.Record.filter(feature_type1__contains=\"entity\")) == 2\n    value_record.features.remove_values(\"feature_type1\")\n    value_record.delete(permanent=True)\n    # Numeric comparators __lt, __gt (int, float, num)\n    assert ln.Record.filter(feature_int__lt=21).one_or_none() is None\n    assert len(ln.Record.filter(feature_int__gt=21)) >= 1\n    # int __lt/__gt that would fail with string comparison (42 vs 5, 42 vs 100)\n    assert ln.Record.filter(feature_int__lt=5).one_or_none() is None\n    assert ln.Record.filter(feature_int__gt=100).one_or_none() is None\n    # float/num __lt/__gt (numeric comparison on SQLite via json_extract + CAST)\n    assert ln.Record.filter(feature_float__lt=5.0).one() == test_record\n    assert ln.Record.filter(feature_float__gt=1.0).one() == test_record\n    assert ln.Record.filter(feature_float__gt=10.0).one_or_none() is None\n    assert ln.Record.filter(feature_num__lt=5.0).one() == test_record\n    assert ln.Record.filter(feature_num__gt=1.0).one() == test_record\n    assert ln.Record.filter(feature_num__gt=10.0).one_or_none() is None\n    # Date and datetime comparators (ISO strings)\n    assert ln.Record.filter(feature_date__lt=\"2024-01-02\").one() == test_record\n    assert ln.Record.filter(feature_date__gt=\"2023-12-31\").one() == test_record\n    assert ln.Record.filter(feature_date__gt=\"2024-01-02\").one_or_none() is None\n    assert (\n        ln.Record.filter(feature_datetime__lt=\"2024-01-01T13:00:00\").one()\n        == test_record\n    )\n    assert (\n        ln.Record.filter(feature_datetime__gt=\"2024-01-01T11:00:00\").one()\n        == test_record\n    )\n    assert (\n        ln.Record.filter(feature_datetime__lt=\"2024-01-01T11:00:00\").one_or_none()\n        is None\n    )\n\n    # ManyToMany accesors\n\n    assert set(test_record.linked_records.to_list()) == {record_entity1, record_entity2}\n    assert test_record.linked_in_records.count() == 0\n    assert 
set(record_entity1.linked_in_records.to_list()) == {test_record}\n    assert set(record_entity2.linked_in_records.to_list()) == {test_record}\n    assert record_entity1.linked_records.count() == 0\n    assert record_entity2.linked_records.count() == 0\n\n    # all empty sheet\n\n    schema = ln.Schema(\n        [\n            feature_bool,\n            feature_str,\n            feature_int,\n            feature_list_str,\n            feature_list_int,\n            feature_num,\n            feature_url,\n            feature_float,\n            feature_list_float,\n            feature_list_num,\n            feature_datetime,\n            feature_date,\n            feature_dict,\n            feature_type1,\n            feature_type1s,\n            feature_ulabel,\n            feature_user,\n            feature_project,\n            feature_cell_line,\n            feature_cell_lines,\n            feature_cl_ontology_id,\n            feature_gene,\n            feature_artifact,\n            feature_collection,\n            feature_run,\n        ],\n        name=\"test_schema\",\n    ).save()\n    sheet = ln.Record(name=\"Sheet\", is_type=True, schema=schema).save()\n    empty_record = ln.Record(name=\"empty_record\", type=sheet).save()\n    df_empty = sheet.to_dataframe()\n\n    assert df_empty[\"feature_bool\"].isnull().all()\n    assert df_empty[\"feature_bool\"].dtype.name == \"boolean\"\n    assert df_empty[\"feature_str\"].isnull().all()\n    assert df_empty[\"feature_str\"].dtype.name == \"string\"\n    assert df_empty[\"feature_int\"].isnull().all()\n    assert df_empty[\"feature_int\"].dtype.name == \"Int64\"\n    assert df_empty[\"feature_float\"].isnull().all()\n    assert df_empty[\"feature_float\"].dtype.name == \"float64\"\n    assert df_empty[\"feature_num\"].isnull().all()\n    assert df_empty[\"feature_num\"].dtype.name == \"float64\"\n    assert df_empty[\"feature_url\"].isnull().all()\n    assert df_empty[\"feature_url\"].dtype.name == \"string\"\n    
assert df_empty[\"feature_list_str\"].isnull().all()\n    assert df_empty[\"feature_list_str\"].dtype.name == \"object\"\n    assert df_empty[\"feature_list_int\"].isnull().all()\n    assert df_empty[\"feature_list_int\"].dtype.name == \"object\"\n    assert df_empty[\"feature_datetime\"].isnull().all()\n    assert df_empty[\"feature_datetime\"].dtype.name == \"datetime64[ns]\"\n    assert df_empty[\"feature_date\"].isnull().all()\n    assert df_empty[\"feature_date\"].dtype.name == \"object\"\n    assert df_empty[\"feature_dict\"].isnull().all()\n    assert df_empty[\"feature_dict\"].dtype.name == \"object\"\n    assert df_empty[\"feature_type1\"].isnull().all()\n    assert df_empty[\"feature_type1\"].dtype.name == \"category\"\n    assert df_empty[\"feature_type1s\"].isnull().all()\n    assert df_empty[\"feature_type1s\"].dtype.name == \"object\"\n    assert df_empty[\"feature_ulabel\"].isnull().all()\n    assert df_empty[\"feature_ulabel\"].dtype.name == \"category\"\n    assert df_empty[\"feature_user\"].isnull().all()\n    assert df_empty[\"feature_user\"].dtype.name == \"category\"\n    assert df_empty[\"feature_project\"].isnull().all()\n    assert df_empty[\"feature_project\"].dtype.name == \"category\"\n    assert df_empty[\"feature_cell_line\"].isnull().all()\n    assert df_empty[\"feature_cell_line\"].dtype.name == \"category\"\n    assert df_empty[\"feature_cell_lines\"].isnull().all()\n    assert df_empty[\"feature_cell_lines\"].dtype.name == \"object\"\n    assert df_empty[\"feature_cl_ontology_id\"].isnull().all()\n    assert df_empty[\"feature_cl_ontology_id\"].dtype.name == \"category\"\n    assert df_empty[\"feature_artifact\"].isnull().all()\n    assert df_empty[\"feature_artifact\"].dtype.name == \"category\"\n    assert df_empty[\"feature_collection\"].isnull().all()\n    assert df_empty[\"feature_collection\"].dtype.name == \"category\"\n    assert df_empty[\"feature_run\"].isnull().all()\n    assert df_empty[\"feature_run\"].dtype.name == 
\"category\"\n\n    # remove empty record from sheet\n    empty_record.type = None\n    empty_record.save()\n\n    # sheet with values\n\n    test_record.type = sheet\n    test_record.save()\n    df = sheet.to_dataframe()\n    target_result = {\n        \"feature_bool\": True,\n        \"feature_str\": \"00810702-0006\",  # this string value could be cast to datetime!\n        \"feature_list_str\": [\"a\", \"list\", \"of\", \"strings\"],\n        \"feature_int\": 42,\n        \"feature_list_int\": [1, 2, 3],\n        \"feature_float\": 3.14,\n        \"feature_list_float\": [2.71, 3.14, 1.61],\n        \"feature_num\": 3.14,\n        \"feature_url\": \"https://lamin.ai/docs\",\n        \"feature_list_num\": [2.71, 3.14, 1.61],\n        \"feature_datetime\": pd.Timestamp(\"2024-01-01 12:00:00\"),\n        \"feature_date\": date(2024, 1, 1),\n        \"feature_dict\": {\"key\": \"value\", \"list\": [1, 2, 3], \"number\": 123},\n        \"feature_type1\": \"entity1\",\n        \"feature_ulabel\": \"test-ulabel\",\n        \"feature_user\": ln.setup.settings.user.handle,\n        \"feature_project\": \"test_project\",\n        \"feature_cell_line\": \"HEK293\",\n        \"feature_cl_ontology_id\": \"CVCL_0045\",\n        \"feature_gene\": \"Tmem276\",\n        \"feature_artifact\": \"test-artifact\",\n        \"feature_collection\": \"test-collection\",\n        \"feature_run\": run.uid,\n        \"__lamindb_record_uid__\": test_record.uid,\n        \"__lamindb_record_name__\": \"test_record\",\n    }\n    result = df.to_dict(orient=\"records\")[0]\n    # need to handle categorical lists differently because\n    # we don't yet respect ordering\n    result_feature_type1s = result.pop(\"feature_type1s\")\n    assert set(result_feature_type1s) == {\"entity1\", \"entity2\"}\n    assert isinstance(result_feature_type1s, list)\n    result_feature_cell_lines = result.pop(\"feature_cell_lines\")\n    assert set(result_feature_cell_lines) == {\"HEK293\", \"A-549\"}\n    assert 
isinstance(result_feature_cell_lines, list)\n    assert result == target_result\n\n    # export to artifact to trigger validation -- this will raise many errors if anything is inconsistent\n\n    sheet_as_artifact = sheet.to_artifact()\n\n    # could devise a test for get_values or features.describe()\n    # but this is extensively tested elsewhere\n    # print(sheet_as_artifact.features.get_values())\n    # assert sheet_as_artifact.features.get_values()\n\n    sheet_as_artifact.delete(permanent=True)\n\n    # add the empty record back to the sheet and export again\n\n    empty_record.type = sheet\n    empty_record.save()\n    df = sheet.to_dataframe()\n    sheet_as_artifact = sheet.to_artifact()\n    sheet_as_artifact.delete(permanent=True)\n\n    # test passing ISO-format date string for date\n\n    test_record2 = ln.Record(name=\"test_record\").save()\n    # we could also test different ways of formatting but don't yet do that\n    # in to_dataframe() we enforce ISO format already\n    feature_date = ln.Feature.get(name=\"feature_date\")\n    feature_date.coerce = True  # have to allow coercion because we're passing a string\n    feature_date.save()\n    test_values[\"feature_date\"] = \"2024-01-02\"\n    test_record2.features.add_values(test_values)\n    test_record2.type = sheet\n    test_record2.save()\n    test_values[\"feature_date\"] = date(2024, 1, 2)\n    assert test_record2.features.get_values() == test_values\n    assert test_record.features.get_values() != test_values\n\n    # also test export to artifact again\n    sheet_as_artifact = sheet.to_artifact()\n    sheet_as_artifact.delete(permanent=True)\n    test_record2.delete(permanent=True)\n    empty_record.delete(permanent=True)\n\n    # test move a value into the trash\n\n    record_entity1.delete()\n    test_values.pop(\"feature_type1\")\n    test_values[\"feature_type1s\"] = [\"entity2\"]\n    test_values[\"feature_date\"] = date(2024, 1, 1)\n    assert test_record.features.get_values() == 
test_values\n\n    df = sheet.to_dataframe()\n    result = df.to_dict(orient=\"records\")[0]\n    result_feature_type1s = result.pop(\"feature_type1s\")\n    assert set(result_feature_type1s) == {\"entity2\"}\n    assert isinstance(result_feature_type1s, list)\n    result_feature_cell_lines = result.pop(\"feature_cell_lines\")\n    assert set(result_feature_cell_lines) == {\"HEK293\", \"A-549\"}\n    assert isinstance(result_feature_cell_lines, list)\n    target_result.pop(\"feature_type1\")\n    assert pd.isna(result.pop(\"feature_type1\"))\n    assert result == target_result\n\n    record_entity1.restore()\n    test_values[\"feature_type1\"] = \"entity1\"\n    test_values[\"feature_type1s\"] = [\"entity1\", \"entity2\"]\n\n    # remove values\n\n    test_record.features.remove_values(\"feature_int\")\n    test_values.pop(\"feature_int\")\n    assert test_record.features.get_values() == test_values\n\n    test_record.features.remove_values(\"feature_date\")\n    test_values.pop(\"feature_date\")\n    assert test_record.features.get_values() == test_values\n\n    test_record.features.remove_values(\"feature_type1\")\n    test_values.pop(\"feature_type1\")\n    assert test_record.features.get_values() == test_values\n\n    test_record.features.remove_values(\"feature_type1s\")\n    test_values.pop(\"feature_type1s\")\n    assert test_record.features.get_values() == test_values\n\n    test_record.features.remove_values(\"feature_ulabel\")\n    test_values.pop(\"feature_ulabel\")\n    assert test_record.features.get_values() == test_values\n\n    test_record.features.remove_values(\"feature_cell_line\")\n    test_values.pop(\"feature_cell_line\")\n    assert test_record.features.get_values() == test_values\n\n    test_record.features.remove_values(\"feature_user\")\n    test_values.pop(\"feature_user\")\n    assert test_record.features.get_values() == test_values\n\n    test_record.features.remove_values(\"feature_artifact\")\n    
test_values.pop(\"feature_artifact\")\n    assert test_record.features.get_values() == test_values\n\n    test_record.features.remove_values(\"feature_collection\")\n    test_values.pop(\"feature_collection\")\n    assert test_record.features.get_values() == test_values\n\n    test_record.features.remove_values(\"feature_run\")\n    test_values.pop(\"feature_run\")\n    assert test_record.features.get_values() == test_values\n\n    # test passing None has no effect, does not lead to annotation\n\n    sheet.schema = None\n    sheet.save()\n    schema.delete(permanent=True)\n\n    test_record.features.add_values({\"feature_int\": None, \"feature_type1\": None})\n    assert test_record.features.get_values() == test_values\n\n    # schema validation\n\n    feature_str = ln.Feature.get(name=\"feature_str\")\n    feature_int = ln.Feature.get(name=\"feature_int\")\n    schema = ln.Schema([feature_str, feature_int], name=\"test_schema\").save()\n    test_form = ln.Record(name=\"TestForm\", is_type=True, schema=schema).save()\n    test_record_in_form = ln.Record(name=\"test_record_in_form\", type=test_form).save()\n    with pytest.raises(ln.errors.ValidationError) as error:\n        test_record_in_form.features.add_values({\"feature_type1\": \"entity1\"})\n    assert \"COLUMN_NOT_IN_DATAFRAME\" in error.exconly()\n    test_record_in_form.delete(permanent=True)\n    test_form.delete(permanent=True)\n    schema.delete(permanent=True)\n\n    # test with list of strings\n\n    schema = ln.Schema([feature_cell_lines], name=\"test_schema2\").save()\n    test_form = ln.Record(name=\"TestForm\", is_type=True, schema=schema).save()\n    test_record_in_form = ln.Record(name=\"test_record_in_form\", type=test_form).save()\n    test_record_in_form.features.add_values({\"feature_cell_lines\": [\"HEK293\", \"A-549\"]})\n    test_record_in_form.delete(permanent=True)\n    test_form.delete(permanent=True)\n    schema.delete(permanent=True)\n\n    # test with list of records (rather than 
passing strings)\n\n    schema = ln.Schema([feature_cell_lines], name=\"test_schema2\").save()\n    test_form = ln.Record(name=\"TestForm\", is_type=True, schema=schema).save()\n    test_record_in_form = ln.Record(name=\"test_record_in_form\", type=test_form).save()\n    test_record_in_form.features.add_values({\"feature_cell_lines\": [a549, hek293]})\n    test_record_in_form.delete(permanent=True)\n    test_form.delete(permanent=True)\n    schema.delete(permanent=True)\n\n    # clean up rest\n    test_record_id = test_record.id\n    assert ln.models.RecordJson.filter(record_id=test_record_id).count() > 0\n    test_record.delete(permanent=True)\n    # test CASCADE deletion of RecordJson\n    assert ln.models.RecordJson.filter(record_id=test_record_id).count() == 0\n    sheet.delete(permanent=True)\n    feature_str.delete(permanent=True)\n    feature_list_str.delete(permanent=True)\n    feature_int.delete(permanent=True)\n    feature_list_int.delete(permanent=True)\n    feature_datetime.delete(permanent=True)\n    feature_date.delete(permanent=True)\n    feature_type1.delete(permanent=True)\n    feature_type1s.delete(permanent=True)\n    feature_ulabel.delete(permanent=True)\n    feature_user.delete(permanent=True)\n    feature_project.delete(permanent=True)\n    feature_dict.delete(permanent=True)\n    feature_artifact.delete(permanent=True)\n    feature_run.delete(permanent=True)\n    feature_cell_lines.delete(permanent=True)\n    record_entity1.delete(permanent=True)\n    record_entity2.delete(permanent=True)\n    record_type1.delete(permanent=True)\n    test_project.delete(permanent=True)\n    feature_cell_line.delete(permanent=True)\n    feature_cl_ontology_id.delete(permanent=True)\n    feature_collection.delete(permanent=True)\n    feature_gene.delete(permanent=True)\n    hek293.delete(permanent=True)\n    a549.delete(permanent=True)\n    tmem276.delete(permanent=True)\n    ulabel.delete(permanent=True)\n    collection.delete(permanent=True)\n    
artifact.delete(permanent=True)\n    run.delete(permanent=True)\n    transform.delete(permanent=True)\n    feature_num.delete(permanent=True)\n    feature_url.delete(permanent=True)\n\n\ndef test_date_and_datetime_corruption():\n    feature_datetime = ln.Feature(\n        name=\"feature_datetime\", dtype=datetime, coerce=True\n    ).save()\n    feature_date = ln.Feature(\n        name=\"feature_date\", dtype=datetime.date, coerce=True\n    ).save()\n    schema = ln.Schema(\n        [feature_datetime, feature_date], name=\"test_schema_date_datetime\"\n    ).save()\n    test_sheet = ln.Record(name=\"TestSheet\", is_type=True).save()\n    record = ln.Record(name=\"test_record\", type=test_sheet).save()\n\n    # pass values with Z suffix\n    test_values = {\n        \"feature_datetime\": \"2024-01-01T12:00:00Z\",\n        \"feature_date\": \"2025-01-17\",\n    }\n    record.features.add_values(test_values)\n    date_value = ln.models.RecordJson.get(record=record, feature=feature_date)\n    # manually corrupt the value\n    date_value.value = \"2025-01-17T00:00:00.000Z\"\n    date_value.save()\n    assert record.features.get_values() == {\n        \"feature_datetime\": pd.Timestamp(\"2024-01-01 12:00:00\", tz=\"UTC\"),\n        \"feature_date\": date(2025, 1, 17),\n    }\n    record.schema = schema\n    record.save()\n\n    df = test_sheet.to_dataframe()\n    result = df.to_dict(orient=\"records\")[0]\n    # because in a dataframe we'll hit pandera and pandera expects naive\n    # timestamps, to_dataframe() converts to naive by removing timezone info\n    assert result[\"feature_datetime\"] == pd.Timestamp(\"2024-01-01 12:00:00\")\n    assert result[\"feature_date\"] == date(2025, 1, 17)\n\n    record.delete(permanent=True)\n    test_sheet.delete(permanent=True)\n    schema.delete(permanent=True)\n    feature_datetime.delete(permanent=True)\n    feature_date.delete(permanent=True)\n\n\ndef test_only_list_type_features_and_field_qualifiers():\n    # this test is 
necessary because the logic for adding link tables\n    # to the query previously only fired when a non-list cat feature of the same type was present\n    feature_cell_lines = ln.Feature(\n        name=\"feature_cell_lines\", dtype=list[bt.CellLine]\n    ).save()\n    feature_list_ontology_id = ln.Feature(\n        name=\"feature_list_ontology_id\", dtype=list[bt.Tissue.ontology_id]\n    ).save()\n    schema = ln.Schema(\n        [feature_cell_lines, feature_list_ontology_id], name=\"test_schema2\"\n    ).save()\n    # create a feature with the same name to test robustness w.r.t. to this\n    feature_type = ln.Feature(name=\"FeatureTypeX\", is_type=True).save()\n    feature_cell_lines_duplicate = ln.Feature(\n        name=\"feature_cell_lines\", dtype=bt.CellLine, type=feature_type\n    ).save()\n\n    test_sheet = ln.Record(name=\"TestSheet\", is_type=True, schema=schema).save()\n    record = ln.Record(name=\"test_record\", type=test_sheet).save()\n    hek293 = bt.CellLine.from_source(name=\"HEK293\").save()\n    a549 = bt.CellLine.from_source(name=\"A-549\").save()\n    uberon2369 = bt.Tissue.from_source(ontology_id=\"UBERON:0002369\").save()\n    uberon5172 = bt.Tissue.from_source(ontology_id=\"UBERON:0005172\").save()\n\n    test_values = {\n        \"feature_cell_lines\": [\"HEK293\", \"A-549\"],\n        \"feature_list_ontology_id\": [\"UBERON:0002369\", \"UBERON:0005172\"],\n    }\n\n    record.features.add_values(test_values)\n    assert record.features.get_values() == test_values\n\n    df = test_sheet.to_dataframe()\n    result = df.to_dict(orient=\"records\")[0]\n    assert isinstance(result[\"feature_cell_lines\"], list)\n    assert isinstance(result[\"feature_list_ontology_id\"], list)\n    assert set(result[\"feature_cell_lines\"]) == {\"HEK293\", \"A-549\"}\n    assert set(result[\"feature_list_ontology_id\"]) == {\n        \"UBERON:0002369\",\n        \"UBERON:0005172\",\n    }\n\n    # add another record\n    record2 = 
ln.Record(name=\"test_record2\", type=test_sheet).save()\n    test_values2 = {\n        \"feature_cell_lines\": [\"HEK293\"],\n        \"feature_list_ontology_id\": [\"UBERON:0005172\"],\n    }\n    record2.features.add_values(test_values2)\n\n    # trigger validation of the case that has two and a single record\n    # this tests type casting in list-like values\n    artifact = test_sheet.to_artifact()\n    assert (\n        len(artifact.schemas.first().members) == 2\n    )  # this requires top most match filtering during validation\n\n    record.delete(permanent=True)\n    record2.delete(permanent=True)\n    test_sheet.delete(permanent=True)\n    inferred_schema = artifact.schemas.first()\n    artifact.delete(permanent=True)\n    inferred_schema.delete(permanent=True)\n    schema.delete(permanent=True)\n    feature_cell_lines.delete(permanent=True)\n    feature_cell_lines_duplicate.delete(permanent=True)\n    feature_type.delete(permanent=True)\n    hek293.delete(permanent=True)\n    a549.delete(permanent=True)\n    uberon2369.delete(permanent=True)\n    uberon5172.delete(permanent=True)\n\n\ndef test_record_feature_predicate_query():\n    age = ln.Feature(name=\"pred_record_age\", dtype=int).save()\n    record_type = ln.Record(name=\"PredRecordType\", is_type=True).save()\n    record_a = ln.Record(name=\"pred_record_a\", type=record_type).save()\n    record_b = ln.Record(name=\"pred_record_b\", type=record_type).save()\n    record_a.features.add_values({\"pred_record_age\": 42})\n    record_b.features.add_values({\"pred_record_age\": 10})\n\n    assert ln.Record.filter(age > 40).one() == record_a\n    assert ln.Record.filter(age <= 10).one() == record_b\n    neq_results = ln.Record.filter(age != 42)\n    assert record_b in neq_results\n    assert record_a not in neq_results\n\n    record_a.delete(permanent=True)\n    record_b.delete(permanent=True)\n    record_type.delete(permanent=True)\n    age.delete(permanent=True)\n\n\ndef 
test_record_features_accept_feature_object_keys():\n    feature_score = ln.Feature(name=\"record_feature_object_score\", dtype=int).save()\n    feature_tag = ln.Feature(name=\"record_feature_object_tag\", dtype=str).save()\n    record = ln.Record(name=\"record_feature_object_test\").save()\n\n    record.features.add_values({feature_score: 7, \"record_feature_object_tag\": \"a\"})\n    assert record.features.get_values() == {\n        \"record_feature_object_score\": 7,\n        \"record_feature_object_tag\": \"a\",\n    }\n\n    # set_values should also accept Feature objects as dictionary keys.\n    record.features.set_values({feature_tag: \"b\"})\n    assert record.features.get_values() == {\"record_feature_object_tag\": \"b\"}\n\n    record.features.add_values({feature_score: 9})\n    assert record.features.get_values() == {\n        \"record_feature_object_score\": 9,\n        \"record_feature_object_tag\": \"b\",\n    }\n\n    # remove_values supports dictionary inputs with Feature keys.\n    record.features.remove_values({feature_score: 9, feature_tag: None})\n    assert record.features.get_values() == {}\n\n    record.delete(permanent=True)\n    feature_score.delete(permanent=True)\n    feature_tag.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_record_sheet_examples.py",
    "content": "import lamindb as ln\nimport pandas as pd\nfrom lamindb.examples.fixtures.sheets import (\n    populate_nextflow_sheet_with_samples,  # noqa: F401\n    populate_sheets_compound_treatment,  # noqa: F401\n)\n\n\ndef test_float_int_casting():\n    # this test is only needed for as long as we let JS write data into RecordJson\n    # for JS a 3 is a valid float even though any python json parser interprets it as an int\n    feature_int = ln.Feature(name=\"feature_int\", dtype=int).save()\n    feature_float = ln.Feature(name=\"feature_float\", dtype=float).save()\n    test_schema = ln.Schema([feature_int, feature_float], name=\"test_schema\").save()\n    sheet = ln.Record(name=\"TestSheet\", is_type=True, schema=test_schema).save()\n    record = ln.Record(name=\"test_record\", type=sheet).save()\n    record.features.add_values({\"feature_int\": 5, \"feature_float\": 3.0})\n    record_json = ln.models.RecordJson.get(record=record, feature=feature_float)\n    record_json.value = 3\n    record_json.save()\n    df = sheet.to_dataframe()\n    assert df[\"feature_int\"].dtype.name == \"int64\"\n    assert df[\"feature_float\"].dtype.name == \"float64\"\n    # this export call would error if we didn't have type casting\n    artifact = sheet.to_artifact()\n\n    related_schemas = list(artifact.schemas.all())\n    artifact.schemas.clear()\n    artifact.delete(permanent=True)\n    record.delete(permanent=True)\n    sheet.delete(permanent=True)\n    for schema in related_schemas:\n        schema.delete(permanent=True)\n    # schema.delete(permanent=True), not necessary because already deleted above\n    feature_float.delete(permanent=True)\n    feature_int.delete(permanent=True)\n\n\ndef test_record_example_compound_treatment(\n    populate_sheets_compound_treatment: tuple[ln.Record, ln.Record],  # noqa: F811\n):\n    treatments_sheet, sample_sheet1 = populate_sheets_compound_treatment\n\n    dictionary = (\n        ln.Record.filter(type=treatments_sheet)\n        
.to_dataframe()[[\"is_type\", \"name\"]]\n        .to_dict(orient=\"list\")\n    )\n    assert dictionary == {\n        \"is_type\": [\n            False,\n            False,\n        ],\n        \"name\": [\n            \"treatment2\",\n            \"treatment1\",\n        ],\n    }\n\n    dictionary = (\n        ln.Record.filter(type=treatments_sheet)\n        .to_dataframe(features=True)[[\"compound\", \"concentration\", \"name\"]]\n        .to_dict(orient=\"list\")\n    )\n    assert dictionary == {\n        \"compound\": [\n            \"drug2\",\n            \"drug1\",\n        ],\n        \"concentration\": [\n            \"4nM\",\n            \"2nM\",\n        ],\n        \"name\": [\n            \"treatment2\",\n            \"treatment1\",\n        ],\n    }\n\n    dictionary = (\n        ln.Record.filter(type=sample_sheet1)\n        .to_dataframe(features=[\"cell_line\", \"treatment\"])[\n            [\"cell_line\", \"__lamindb_record_name__\", \"treatment\"]\n        ]\n        .to_dict(orient=\"list\")\n    )\n    assert dictionary == {\n        \"cell_line\": [\n            \"HEK293T\",\n            \"HEK293T\",\n        ],\n        \"__lamindb_record_name__\": [\n            \"sample2\",\n            \"sample1\",\n        ],\n        \"treatment\": [\n            \"treatment2\",\n            \"treatment1\",\n        ],\n    }\n\n    assert sample_sheet1.input_of_runs.count() == 0\n    df = sample_sheet1.to_dataframe()\n    assert sample_sheet1.input_of_runs.count() == 1\n    assert df.index.name == \"__lamindb_record_id__\"\n    dictionary = df[\n        [\n            \"id\",  # a feature\n            \"uid\",  # a feature\n            \"name\",  # a feature\n            \"cell_line\",\n            \"treatment\",\n            \"preparation_date\",\n            \"__lamindb_record_name__\",\n        ]\n    ].to_dict(orient=\"list\")\n    assert dictionary == {\n        \"id\": [1, 2],\n        \"uid\": [\"S1\", \"S2\"],\n        \"name\": [\"Sample 
1\", \"Sample 2\"],\n        \"cell_line\": [\n            \"HEK293T\",\n            \"HEK293T\",\n        ],\n        \"preparation_date\": [\n            pd.to_datetime(\"2025-06-01T05:00:00\"),\n            pd.to_datetime(\"2025-06-01T06:00:00\"),\n        ],\n        \"treatment\": [\n            \"treatment1\",\n            \"treatment2\",\n        ],\n        \"__lamindb_record_name__\": [\n            \"sample1\",\n            \"sample2\",\n        ],\n    }\n\n    artifact = sample_sheet1.to_artifact()\n    assert sample_sheet1.schema.members.to_list(\"name\") == [\n        \"id\",\n        \"uid\",\n        \"name\",\n        \"treatment\",\n        \"cell_line\",\n        \"preparation_date\",\n        \"project\",\n    ]\n    assert artifact.run.input_records.count() == 1\n    assert artifact.transform.kind == \"function\"\n    assert artifact.transform.key == \"__lamindb_record_export__\"\n    # looks something like this:\n    # id,uid,name,treatment,cell_line,preparation_date,__lamindb_record_uid__,__lamindb_record_name__\n    # 1,S1,Sample 1,treatment1,HEK293T,2025-06-01 05:00:00,iCwgKgZELoLtIoGy,sample1\n    # 2,S2,Sample 2,treatment2,HEK293T,2025-06-01 06:00:00,qvU9m7VF6fSdsqJs,sample2\n    assert len(artifact.load()) == 2  # two rows in the dataframe\n    assert artifact.path.read_text().startswith(\"\"\"\\\nid,uid,name,treatment,cell_line,preparation_date,project,__lamindb_record_uid__,__lamindb_record_name__\n1,S1,Sample 1,treatment1,HEK293T,2025-06-01 05:00:00,Project 1\"\"\")\n    assert artifact.key == f\"sheet_exports/{sample_sheet1.name}.csv\"\n    assert artifact.description.startswith(f\"Export of sheet {sample_sheet1.uid}\")\n    assert artifact._state.adding is False\n    assert ln.models.ArtifactRecord.filter(artifact=artifact).count() == 2\n    assert artifact.features.describe(return_str=True).endswith(\"\"\"\\\n└── Dataset features\n    └── columns (7)\n        cell_line           bionty.CellLine          HEK293T\n        id          
        int\n        name                str\n        preparation_date    datetime\n        project             Project                  Project 1\n        treatment           Record[Treatment]        treatment1, treatment2\n        uid                 str\"\"\")\n    # re-run the export which triggers hash lookup\n    sample_sheet1.to_artifact()\n    # soft-delete a record in the sheet\n    sample_sheet1.records.first().delete()\n    assert ln.Record.filter(type=sample_sheet1).count() == 1\n    df = sample_sheet1.to_dataframe()\n    print(df)\n    assert len(df) == 1  # one row in the dataframe\n\n    artifact.delete(permanent=True)\n\n\ndef test_nextflow_sheet_with_samples(\n    populate_nextflow_sheet_with_samples: ln.Record,  # noqa: F811\n):\n    \"\"\"Test the example fixture for nextflow sheet with samples.\"\"\"\n    # This test is to ensure that the fixture works as expected\n    # and that the data is correctly populated in the database.\n    nextflow_sheet = populate_nextflow_sheet_with_samples\n\n    df = nextflow_sheet.to_dataframe()\n\n    assert df[\n        [\"expected_cells\", \"fastq_1\", \"fastq_2\", \"sample\", \"__lamindb_record_name__\"]\n    ].to_dict(orient=\"list\") == {\n        \"expected_cells\": [\n            5000,\n            5000,\n            5000,\n        ],\n        \"fastq_1\": [\n            \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R1_001.fastq.gz\",\n            \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R1_001.fastq.gz\",\n            \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R1_001.fastq.gz\",\n        ],\n        \"fastq_2\": [\n            \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R2_001.fastq.gz\",\n            
\"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R2_001.fastq.gz\",\n            \"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R2_001.fastq.gz\",\n        ],\n        \"__lamindb_record_name__\": [\n            None,\n            None,\n            None,\n        ],\n        \"sample\": [\n            \"Sample_X\",\n            \"Sample_Y\",\n            \"Sample_Y\",\n        ],\n    }\n\n    assert nextflow_sheet.schema is not None\n    artifact = nextflow_sheet.to_artifact()\n    assert artifact.schema is nextflow_sheet.schema\n    assert artifact._state.adding is False\n    assert set(nextflow_sheet.schema.members.to_list(\"name\")) == {\n        \"sample\",\n        \"fastq_1\",\n        \"fastq_2\",\n        \"expected_cells\",\n        \"seq_center\",\n    }\n    assert set(artifact.features.slots[\"columns\"].members.to_list(\"name\")) == {\n        \"sample\",\n        \"fastq_1\",\n        \"fastq_2\",\n        \"expected_cells\",\n        \"seq_center\",\n    }\n    assert artifact.path.read_text().startswith(\"\"\"\\\nsample,fastq_1,fastq_2,expected_cells,seq_center,__lamindb_record_uid__,__lamindb_record_name__\nSample_X,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R1_001.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R2_001.fastq.gz,5000,,\"\"\")\n    assert artifact.features.describe(return_str=True).endswith(\"\"\"\\\n└── Dataset features\n    └── columns (5)\n        expected_cells      int\n        fastq_1             str\n        fastq_2             str\n        sample              Record[BioSample]        Sample_X, Sample_Y\n        seq_center          str\"\"\")\n\n    related_schemas = list(artifact.schemas.all())\n    artifact.schemas.clear()\n    for schema in related_schemas:\n        
schema.delete(permanent=True)\n    artifact.delete(permanent=True)\n\n\ndef test_record_soft_deleted_recreate():\n    \"\"\"Test that a soft-deleted record can be recreated with changes.\"\"\"\n    # testing soft delete and recreate with sqlite (postgres is tested in core/test_delete.py)\n    # soft delete a record, then recreate it with some changes\n    record = ln.Record(name=\"test_record\").save()\n    uid = record.uid\n    assert record.branch_id == 1\n    record.delete()\n    assert record.branch_id == -1\n    # now recreate the same record with the same uid but a different name\n    record = ln.Record(name=\"test_record 2\")\n    record.uid = uid\n    record.save()\n    # now this record is recovered from the trash\n    assert record.branch_id == 1\n    assert record.name == \"test_record 2\"\n    ln.Record.objects.filter().delete()\n\n\ndef test_annotate_with_user_feature():\n    \"\"\"Test that annotating with a user feature works as expected.\"\"\"\n    user_feature = ln.Feature(name=\"created_by\", dtype=ln.User).save()\n    schema = ln.Schema(\n        name=\"test_schema_user_feature\",\n        features=[user_feature],\n        coerce=True,\n    ).save()\n    sheet = ln.Record(name=\"A sheet with users\", is_type=True, schema=schema).save()\n    record = ln.Record(name=\"first user\", type=sheet).save()\n    user = ln.User(uid=\"abcdefgh\", handle=\"test-user\").save()\n    ln.models.RecordUser(record=record, feature=user_feature, value=user).save()\n\n    df = sheet.to_dataframe()\n    assert df.index.name == \"__lamindb_record_id__\"\n    assert df.columns.to_list() == [\n        \"created_by\",\n        \"__lamindb_record_uid__\",\n        \"__lamindb_record_name__\",\n    ]\n    assert df.iloc[0][\"created_by\"] == \"test-user\"\n\n    # clean up\n    record.type = None\n    record.save()\n    record.delete(permanent=True)\n    sheet.delete(permanent=True)\n    schema.delete(permanent=True)\n    user_feature.delete(permanent=True)\n    
user.delete(permanent=True)\n\n\ndef test_to_artifact_exports_all_records():\n    # create sheet with >100 records, the default limit for to_dataframe\n    sheet = ln.Record(name=\"LargeSheet\", is_type=True).save()\n    for i in range(101):\n        ln.Record(name=f\"record_{i}\", type=sheet).save()\n    df = sheet.to_dataframe()\n    assert len(df) == 101, f\"Expected 101 records, got {len(df)}\"\n    sheet.records.all().delete(permanent=True)\n    sheet.delete(permanent=True)\n\n\ndef test_to_artifact_with_required_non_nullable_data_id_maximal_set_true():\n    feature_data_id = ln.Feature(name=\"data_id\", dtype=str, nullable=False).save()\n    schema = ln.Schema(\n        [feature_data_id],\n        name=\"schema_with_required_data_id\",\n        maximal_set=True,\n    ).save()\n    sheet = ln.Record(name=\"SheetWithDataId\", is_type=True, schema=schema).save()\n    # Name is intentionally omitted to mirror sheet records in real-world pipelines.\n    record = ln.Record(type=sheet).save()\n    record.features.add_values({\"data_id\": \"D1\"})\n\n    artifact = sheet.to_artifact()\n    df = artifact.load()\n    assert \"data_id\" in df.columns\n    assert df[\"data_id\"].to_list() == [\"D1\"]\n    assert \"__lamindb_record_name__\" in df.columns\n    assert df[\"__lamindb_record_name__\"].isna().all()\n\n    # clean up\n    record.delete(permanent=True)\n    sheet.delete(permanent=True)\n    artifact.delete(permanent=True)\n    schema.delete(permanent=True)\n    feature_data_id.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_rename_features_labels.py",
    "content": "import datetime\nimport os\n\nimport lamindb as ln\nimport pandas as pd\nimport pytest\n\n\ndef test_rename_feature(ccaplog):\n    df = pd.DataFrame({\"old_name\": [1, 2]})\n    ln.Feature(name=\"old_name\", dtype=int).save()\n    artifact = ln.Artifact.from_dataframe(\n        df, key=\"test.parquet\", schema=\"valid_features\"\n    ).save()\n    feature = ln.Feature.get(name=\"old_name\")\n\n    # First rename\n    feature.name = \"new_name\"\n    feature.save()\n    now1 = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)\n    assert (\n        \"by renaming feature from 'old_name' to 'new_name' 1 artifact no longer matches the feature name in storage:\"\n        in ccaplog.text\n    )\n    if os.getenv(\"LAMINDB_TEST_DB_VENDOR\") != \"sqlite\":\n        feature.refresh_from_db()\n        assert feature.synonyms == \"old_name\"\n        assert feature._aux[\"renamed\"] == {\n            now1.isoformat().replace(\"+00:00\", \"Z\"): \"old_name\"\n        }\n\n    # Second rename\n    feature.name = \"newer_name\"\n    feature.save()\n    now2 = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)\n    assert (\n        \"by renaming feature from 'new_name' to 'newer_name' 1 artifact no longer matches the feature name in storage:\"\n        in ccaplog.text\n    )\n    if os.getenv(\"LAMINDB_TEST_DB_VENDOR\") != \"sqlite\":\n        feature.refresh_from_db()\n        assert feature.synonyms == \"old_name|new_name\"\n        assert feature._aux[\"renamed\"] == {\n            now1.isoformat().replace(\"+00:00\", \"Z\"): \"old_name\",\n            now2.isoformat().replace(\"+00:00\", \"Z\"): \"new_name\",\n        }\n\n    schema = artifact.schemas.first()\n    artifact.delete(permanent=True)\n    schema.delete(permanent=True)\n    feature.delete(permanent=True)\n\n\n@pytest.mark.parametrize(\"model_class\", [ln.ULabel, ln.Record])\ndef test_rename_label(model_class, ccaplog):\n    df = pd.DataFrame(\n        {\n         
   \"feature1\": pd.Categorical([\"label1\", \"label2\"]),\n            \"feature2\": pd.Categorical([\"label2\", \"label2\"]),\n        }\n    )\n\n    label1 = model_class(name=\"label1\").save()\n    label2 = model_class(name=\"label2\").save()\n    feature1 = ln.Feature(name=\"feature1\", dtype=model_class).save()\n    feature2 = ln.Feature(name=\"feature2\", dtype=model_class).save()\n    artifact = ln.Artifact.from_dataframe(\n        df, key=\"test.parquet\", schema=\"valid_features\"\n    ).save()\n\n    label = model_class.get(name=\"label1\")\n    label.name = \"label-renamed\"\n    label.save()\n\n    assert (\n        \"by renaming label from 'label1' to 'label-renamed' 1 artifact no longer matches the label name in storage:\"\n        in ccaplog.text\n    )\n\n    schema = artifact.schemas.first()\n    artifact.delete(permanent=True)\n    schema.delete(permanent=True)\n    feature1.delete(permanent=True)\n    feature2.delete(permanent=True)\n    label1.delete(permanent=True)\n    label2.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_run.py",
    "content": "import time\n\nimport lamindb as ln\nimport pytest\n\n\ndef test_run():\n    with pytest.raises(ValueError) as error:\n        ln.Run(1, 2)\n    assert error.exconly() == \"ValueError: Only one non-keyword arg allowed: transform\"\n    with pytest.raises(TypeError) as error:\n        ln.Run()\n    assert error.exconly() == \"TypeError: Pass transform parameter\"\n    transform = ln.Transform(key=\"my_transform\")\n    with pytest.raises(ValueError) as error:\n        ln.Run(transform)\n    assert (\n        error.exconly()\n        == \"ValueError: Please save transform record before creating a run\"\n    )\n    transform.save()\n    run = ln.Run(transform).save()\n    assert run.status == \"scheduled\"\n    assert run.reference is None\n    assert run.reference_type is None\n    run2 = ln.Run(transform, reference=\"test1\", reference_type=\"test2\").save()\n    assert run2.reference == \"test1\"\n    assert run2.reference_type == \"test2\"\n    assert run.uid != run2.uid\n    run.delete(permanent=True)\n\n    report_artifact = ln.Artifact(\n        \"README.md\", kind=\"__lamindb_run__\", description=\"report of run2\"\n    ).save()\n    run2.report = report_artifact\n    environment = ln.Artifact(\n        \"CONTRIBUTING.md\", kind=\"__lamindb_run__\", description=\"requirements.txt\"\n    ).save()\n    run2.environment = environment\n    run2.save()\n\n    # report/env artifacts will be cleaned up in background subprocess\n    run2.delete(permanent=True)\n    assert ln.Run.filter(uid=run2.uid).count() == 0\n    # report/env are still present in the database\n    assert ln.Artifact.filter(uid=report_artifact.uid).count() == 1\n    assert ln.Artifact.filter(uid=environment.uid).count() == 1\n\n    transform.delete(permanent=True)\n    assert ln.Run.filter(uid=run.uid).count() == 0\n\n    # wait for background cleanup subprocess to delete artifacts\n    time.sleep(4)\n    assert ln.Artifact.filter(uid=report_artifact.uid).count() == 0\n    assert 
ln.Artifact.filter(uid=environment.uid).count() == 0\n\n\ndef test_bulk_permanent_run_delete(tmp_path):\n    transform = ln.Transform(key=\"Bulk run delete transform\").save()\n    n_runs = 2\n    report_files = [tmp_path / f\"report_{i}.txt\" for i in range(n_runs)]\n    for i, path in enumerate(report_files):\n        path.write_text(f\"content {i}\")\n    report_artifacts = [\n        ln.Artifact(path, kind=\"__lamindb_run__\", description=f\"report {i}\").save()\n        for i, path in enumerate(report_files)\n    ]\n    runs = [ln.Run(transform, report=af).save() for af in report_artifacts]\n    run_ids = [r.id for r in runs]\n    ln.settings.verbosity = \"debug\"\n    ln.Run.filter(id__in=run_ids).order_by(\"created_at\").delete(permanent=True)\n    assert ln.Run.filter(id__in=run_ids).count() == 0\n    assert ln.Artifact.filter(uid=report_artifacts[0].uid).count() == 1\n    transform.delete(permanent=True)\n\n    # wait for background cleanup subprocess to delete artifacts\n    time.sleep(4)\n    assert ln.Artifact.filter(uid=report_artifacts[0].uid).count() == 0\n    clean_up_logs = ln.setup.settings.cache_dir / f\"run_cleanup_logs_{runs[0].uid}.txt\"\n    assert f\"deleted artifact {report_artifacts[0].id}\" in clean_up_logs.read_text()\n"
  },
  {
    "path": "tests/core/test_save.py",
    "content": "# ruff: noqa: F811\n\nimport lamindb as ln\nimport pytest\nfrom _dataset_fixtures import (  # noqa\n    get_mini_csv,\n)\nfrom lamindb.models.save import prepare_error_message, store_artifacts\n\n\ndef test_bulk_save_and_update():\n    label_names = [f\"Record {i} new\" for i in range(3)]\n    labels = [ln.Record(name=name) for name in label_names]\n    # test bulk creation of new records\n    ln.save(labels)\n    assert len(ln.Record.filter(name__in=label_names).distinct()) == 3\n    labels[0].name = \"Record 0 updated\"\n    # test bulk update of existing records\n    ln.save(labels)\n    assert len(ln.Record.filter(name__in=label_names).distinct()) == 2\n    assert ln.Record.get(name=\"Record 0 updated\")\n\n\ndef test_prepare_error_message(get_mini_csv):\n    artifact = ln.Artifact(get_mini_csv, description=\"test\")\n    exception = Exception(\"exception\")\n\n    error = prepare_error_message([], [artifact], exception)\n    assert error.startswith(\n        \"The following entries have been successfully uploaded and committed to the database\"\n    )\n\n    error = prepare_error_message([artifact], [], exception)\n    assert error.startswith(\"No entries were uploaded or committed to the database\")\n\n\ndef test_save_data_object(get_mini_csv):\n    artifact = ln.Artifact(get_mini_csv, description=\"test\")\n    artifact.save()\n    assert artifact.path.exists()\n    artifact.delete(permanent=True, storage=True)\n\n\ndef test_store_artifacts_acid(get_mini_csv):\n    artifact = ln.Artifact(get_mini_csv, description=\"test\")\n    artifact._clear_storagekey = \"test.csv\"\n    # errors on check_and_attempt_clearing\n    with pytest.raises(FileNotFoundError):\n        artifact.save()\n\n    with pytest.raises(RuntimeError) as error:\n        store_artifacts([artifact], using_key=None)\n    assert str(error.exconly()).startswith(\n        \"RuntimeError: The following entries have been successfully uploaded\"\n    )\n\n    
artifact.delete(permanent=True)\n\n\ndef test_save_parents():\n    import bionty as bt\n\n    bt.CellType.from_values([\"B cell\", \"T cell\"]).save()\n    assert bt.CellType.get(name=\"B cell\").parents.to_dataframe().shape[0] == 1\n    bt.CellType.filter().delete(permanent=True)\n\n\ndef test_save_batch_size():\n    label_names = [f\"Record {i} batch_size\" for i in range(3)]\n    labels = [ln.Record(name=name) for name in label_names]\n    # test bulk creation of new records with batch size\n    ln.save(labels, batch_size=2)\n    assert ln.Record.filter(name__in=label_names).distinct().count() == 3\n\n\ndef test_bulk_save_lazy_record_features():\n    cell_type = ln.Record(name=\"lazy-cell-type\", is_type=True).save()\n    ln.Record(name=\"lazy-b-cell\", type=cell_type).save()\n    ln.Record(name=\"lazy-t-cell\", type=cell_type).save()\n    score_feature = ln.Feature(name=\"lazy-bulk-score\", dtype=float).save()\n    cell_feature = ln.Feature(name=\"lazy-bulk-cell\", dtype=cell_type).save()\n    schema = ln.Schema([score_feature, cell_feature], name=\"lazy-bulk-schema\").save()\n    sheet = ln.Record(name=\"lazy-sheet\", is_type=True, schema=schema).save()\n\n    records = [\n        ln.Record(\n            name=\"lazy-sample-1\",\n            type=sheet,\n            features={\"lazy-bulk-score\": 0.1, \"lazy-bulk-cell\": \"lazy-b-cell\"},\n        ),\n        ln.Record(\n            name=\"lazy-sample-2\",\n            type=sheet,\n            features={\"lazy-bulk-score\": 0.2, \"lazy-bulk-cell\": \"lazy-t-cell\"},\n        ),\n    ]\n    ln.save(records)\n\n    sample_1 = ln.Record.get(name=\"lazy-sample-1\")\n    sample_2 = ln.Record.get(name=\"lazy-sample-2\")\n    sample_1_values = sample_1.features.get_values()\n    sample_2_values = sample_2.features.get_values()\n    assert sample_1_values[\"lazy-bulk-score\"] == 0.1\n    assert sample_2_values[\"lazy-bulk-score\"] == 0.2\n    assert sample_1_values[\"lazy-bulk-cell\"] == \"lazy-b-cell\"\n    assert 
sample_2_values[\"lazy-bulk-cell\"] == \"lazy-t-cell\"\n    assert not hasattr(records[0], \"_features\")\n    assert not hasattr(records[1], \"_features\")\n\n    ln.Record.filter(name__in=[\"lazy-sample-1\", \"lazy-sample-2\"]).delete(permanent=True)\n    ln.Record.filter(name=\"lazy-sheet\").delete(permanent=True)\n    ln.Record.filter(name__in=[\"lazy-b-cell\", \"lazy-t-cell\"]).delete(permanent=True)\n    ln.Record.filter(name=\"lazy-cell-type\").delete(permanent=True)\n    schema.delete(permanent=True)\n    score_feature.delete(permanent=True)\n    cell_feature.delete(permanent=True)\n\n\ndef test_bulk_save_lazy_record_features_requires_same_schema():\n    feature_a = ln.Feature(name=\"lazy-schema-a\", dtype=float).save()\n    feature_b = ln.Feature(name=\"lazy-schema-b\", dtype=float).save()\n    schema_a = ln.Schema([feature_a], name=\"lazy-schema-a\").save()\n    schema_b = ln.Schema([feature_b], name=\"lazy-schema-b\").save()\n    type_a = ln.Record(name=\"lazy-type-a\", is_type=True, schema=schema_a).save()\n    type_b = ln.Record(name=\"lazy-type-b\", is_type=True, schema=schema_b).save()\n\n    records = [\n        ln.Record(name=\"lazy-mixed-1\", type=type_a, features={\"lazy-schema-a\": 1.0}),\n        ln.Record(name=\"lazy-mixed-2\", type=type_b, features={\"lazy-schema-b\": 2.0}),\n    ]\n    with pytest.raises(\n        ln.errors.ValidationError,\n        match=\"same type schema\",\n    ):\n        ln.save(records)\n\n    ln.Record.filter(name__in=[\"lazy-mixed-1\", \"lazy-mixed-2\"]).delete(permanent=True)\n    ln.Record.filter(name__in=[\"lazy-type-a\", \"lazy-type-b\"]).delete(permanent=True)\n    schema_a.delete(permanent=True)\n    schema_b.delete(permanent=True)\n    feature_a.delete(permanent=True)\n    feature_b.delete(permanent=True)\n\n\ndef test_bulk_save_lazy_record_features_requires_schema():\n    unschematized_type = ln.Record(name=\"lazy-no-schema-type\", is_type=True).save()\n\n    records = [\n        ln.Record(\n            
name=\"lazy-no-schema-1\", type=unschematized_type, features={\"foo\": 1.0}\n        )\n    ]\n    with pytest.raises(\n        ln.errors.ValidationError,\n        match=\"same non-null type schema\",\n    ):\n        ln.save(records)\n\n    ln.Record.filter(name=\"lazy-no-schema-1\").delete(permanent=True)\n    ln.Record.filter(name=\"lazy-no-schema-type\").delete(permanent=True)\n\n\ndef test_bulk_resave_trashed_records():\n    import bionty as bt\n\n    # first create records from public source\n    records = bt.Ethnicity.from_values([\"asian\", \"white\"]).save()\n    assert len(records) == 2\n    # parents are also created\n    ethnicities = bt.Ethnicity.filter()\n    assert ethnicities.count() > 2\n    # soft delete the records including parent\n    ethnicities.delete()\n    # then create them again from public source\n    # the new records will now have the same uids as they are hashed from the ontology_ids\n    assert bt.Ethnicity.filter().count() == 0\n    new_records = bt.Ethnicity.from_values([\"asian\", \"white\", \"african\"])\n    assert new_records[0].branch_id == 1\n    assert new_records[0].uid == records[0].uid\n    # after saving, the trashed records should be restored\n    new_records.save()\n    assert new_records[0].branch_id == 1\n    ethnicities = bt.Ethnicity.filter()\n    # the parent should also be restored\n    assert ethnicities.count() > 3\n\n    # clean up\n    ethnicities.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_schema.py",
    "content": "import bionty as bt\nimport lamindb as ln\nimport pandas as pd\nimport pytest\nfrom django.db.utils import IntegrityError\nfrom lamindb.errors import FieldValidationError, InvalidArgument, ValidationError\nfrom lamindb.models.schema import get_related_name, validate_features\n\n\n@pytest.fixture(scope=\"module\")\ndef df():\n    return pd.DataFrame(\n        {\n            \"feat1\": [1, 2, 3],\n            \"feat2\": [3, 4, 5],\n            \"feat3\": [\"cond1\", \"cond2\", \"cond2\"],\n            \"feat4\": [\"id1\", \"id2\", \"id3\"],\n        }\n    )\n\n\ndef test_schema_from_values():\n    gene_symbols = [\"TCF7\", \"MYC\"]\n    bt.Gene.filter(symbol__in=gene_symbols).delete(permanent=True)\n    with pytest.raises(ValidationError) as error:\n        schema = ln.Schema.from_values(\n            gene_symbols, bt.Gene.symbol, dtype=int, organism=\"human\"\n        )\n    assert error.exconly().startswith(\n        \"lamindb.errors.ValidationError: These values could not be validated:\"\n    )\n    ln.save(bt.Gene.from_values(gene_symbols, \"symbol\", organism=\"human\"))\n    schema = ln.Schema.from_values(gene_symbols, bt.Gene.symbol, organism=\"human\")\n    # below should be a queryset and not a list\n    assert set(schema.members) == set(\n        bt.Gene.from_values(gene_symbols, \"symbol\", organism=\"human\")\n    )\n    assert schema.dtype == \"num\"  # this is NUMBER_TYPE\n    schema = ln.Schema.from_values(\n        gene_symbols, bt.Gene.symbol, dtype=int, organism=\"human\"\n    )\n    assert schema._state.adding\n    assert schema.dtype == \"int\"\n    assert schema.itype == \"bionty.Gene\"\n    schema.save()\n    assert set(schema.members) == set(schema.genes.all())\n    id = schema.id\n    # test that the schema is retrieved from the database\n    # in case it already exists\n    schema = ln.Schema.from_values(\n        gene_symbols, bt.Gene.symbol, dtype=int, organism=\"human\"\n    )\n    assert not schema._state.adding\n    
assert id == schema.id\n    schema.delete(permanent=True)\n\n    # edge cases\n    with pytest.raises(ValueError):\n        schema = ln.Schema.from_values([])\n    with pytest.raises(TypeError):\n        ln.Schema.from_values([\"a\"], field=\"name\")\n    with pytest.raises(ValidationError):\n        schema = ln.Schema.from_values(\n            [\"weird_name\"], field=ln.Feature.name, dtype=\"float\"\n        )\n\n\ndef test_schema_from_records(df):\n    features = ln.Feature.from_dataframe(df)\n    with pytest.raises(ValueError) as error:\n        schema = ln.Schema(features)\n    assert (\n        error.exconly()\n        == \"ValueError: Can only construct feature sets from validated features\"\n    )\n\n    ln.save(features)\n    schema = ln.Schema(features)\n    assert schema.id is None\n    assert schema._state.adding\n    assert schema.dtype is None\n    assert schema.itype == \"Feature\"\n    schema.save()\n    # test that the schema is retrieved from the database\n    # in case it already exists\n    schema = ln.Schema(features)\n    assert not schema._state.adding\n    assert schema.id is not None\n    schema.delete(permanent=True)\n\n    # edge case\n    with pytest.raises(ValueError):\n        positional_arg = 1\n        ln.Schema(features, positional_arg)\n\n\ndef test_schema_from_df(df):\n    # test using type\n    human = bt.Organism.from_source(name=\"human\").save()\n    genes = [bt.Gene(symbol=name, organism=human) for name in df.columns]\n    ln.save(genes)\n    with pytest.raises(ValueError) as error:\n        ln.Schema.from_dataframe(df, field=bt.Gene.symbol)\n    assert error.exconly().startswith(\"ValueError: data types are heterogeneous:\")\n    schema = ln.Schema.from_dataframe(df[[\"feat1\", \"feat2\"]], field=bt.Gene.symbol)\n    for gene in genes:\n        gene.delete(permanent=True)\n\n    # now for the features registry\n    features = ln.Feature.from_dataframe(df)\n    ln.save(features)\n    schema = 
ln.Schema.from_dataframe(df).save()\n    assert schema.dtype is None\n    ln.Schema.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n\n\ndef test_get_related_name():\n    with pytest.raises(ValueError):\n        get_related_name(ln.Transform)\n\n\ndef test_validate_features():\n    with pytest.raises(ValueError):\n        validate_features([])\n    with pytest.raises(TypeError):\n        validate_features([\"feature\"])\n    with pytest.raises(TypeError):\n        validate_features({\"feature\"})\n    transform = ln.Transform(key=\"test\").save()\n    # This is just a type check\n    with pytest.raises(TypeError) as error:\n        validate_features([transform, ln.Run(transform)])\n    assert error.exconly() == \"TypeError: schema can only contain a single type\"\n    transform.delete(permanent=True)\n\n\ndef test_kwargs():\n    with pytest.raises(FieldValidationError):\n        ln.Schema(x=\"1\", features=[])\n\n\ndef test_edge_cases():\n    feature = ln.Feature(name=\"rna\", dtype=\"float\")\n    ln.save([feature])\n    with pytest.raises(ValueError) as error:\n        ln.Schema(feature)\n    assert (\n        error.exconly()\n        == \"ValueError: Please pass a ListLike of features, not a single feature\"\n    )\n    feature.delete(permanent=True)\n\n\n@pytest.fixture(scope=\"module\")\ndef mini_immuno_schema_flexible():\n    schema = ln.examples.datasets.mini_immuno.define_mini_immuno_schema_flexible()\n\n    yield schema\n\n    ln.Schema.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n    bt.Gene.filter().delete(permanent=True)\n    ln.Record.filter(type__isnull=False).delete(permanent=True)\n    ln.Record.filter().delete(permanent=True)\n    bt.CellType.filter().delete(permanent=True)\n\n\ndef test_schema_update_implicit_through_name_equality(\n    mini_immuno_schema_flexible: ln.Schema,\n    ccaplog,\n):\n    df = pd.DataFrame({\"a\": [1]})\n    artifact = ln.Artifact.from_dataframe(df, 
key=\"test_artifact.parquet\").save()\n    artifact.schema = mini_immuno_schema_flexible\n    artifact.save()\n\n    orig_hash = mini_immuno_schema_flexible.hash\n    warning_message = \"you updated the schema hash and might invalidate datasets that were previously validated with this schema:\"\n\n    # different numbers of features -------------------------------------------\n\n    schema = ln.Schema(\n        name=\"Mini immuno schema\",\n        features=[\n            ln.Feature.get(name=\"perturbation\"),\n            ln.Feature.get(name=\"donor\"),\n        ],\n    ).save()\n\n    assert schema.hash != orig_hash\n    assert ccaplog.text.count(warning_message) == 1\n\n    # change is flexible (an auxiliary field) --------------------------------\n\n    schema = ln.Schema(\n        name=\"Mini immuno schema\",\n        features=[\n            ln.Feature.get(name=\"perturbation\"),\n            ln.Feature.get(name=\"cell_type_by_model\"),\n            ln.Feature.get(name=\"assay_oid\"),\n            ln.Feature.get(name=\"donor\"),\n            ln.Feature.get(name=\"concentration\"),\n            ln.Feature.get(name=\"treatment_time_h\"),\n        ],\n        flexible=True,\n    ).save()\n\n    assert schema.hash == orig_hash  # restored original hash\n    assert ccaplog.text.count(warning_message) == 2  # warning raised\n\n    schema = ln.Schema(\n        name=\"Mini immuno schema\",\n        features=[\n            ln.Feature.get(name=\"perturbation\"),\n            ln.Feature.get(name=\"cell_type_by_model\"),\n            ln.Feature.get(name=\"assay_oid\"),\n            ln.Feature.get(name=\"donor\"),\n            ln.Feature.get(name=\"concentration\"),\n            ln.Feature.get(name=\"treatment_time_h\"),\n        ],\n        flexible=False,\n    ).save()\n\n    assert schema.hash != orig_hash\n    assert ccaplog.text.count(warning_message) == 3  # warning raised\n    ln.examples.datasets.mini_immuno.define_mini_immuno_schema_flexible()\n\n    
artifact.delete(permanent=True)\n\n    # restore original hash  --------------------------------\n\n    schema = ln.Schema(\n        name=\"Mini immuno schema\",\n        features=[\n            ln.Feature.get(name=\"perturbation\"),\n            ln.Feature.get(name=\"cell_type_by_model\"),\n            ln.Feature.get(name=\"assay_oid\"),\n            ln.Feature.get(name=\"donor\"),\n            ln.Feature.get(name=\"concentration\"),\n            ln.Feature.get(name=\"treatment_time_h\"),\n        ],\n        flexible=True,\n    ).save()\n\n    assert schema.hash == orig_hash  # restored original hash\n\n\ndef test_schema_update(\n    mini_immuno_schema_flexible: ln.Schema,\n    ccaplog,\n):\n    df = pd.DataFrame({\"a\": [1]})\n    artifact = ln.Artifact.from_dataframe(df, key=\"test_artifact.parquet\").save()\n    artifact.schema = mini_immuno_schema_flexible\n    artifact.save()\n\n    # store original hash\n\n    orig_hash = mini_immuno_schema_flexible.hash\n    warning_message = \"you updated the schema hash and might invalidate datasets that were previously validated with this schema:\"\n\n    # add a feature -------------------------------------------\n\n    feature_to_add = ln.Feature(name=\"sample_note\", dtype=str).save()\n    assert mini_immuno_schema_flexible.n_members == 6\n    mini_immuno_schema_flexible.features.add(feature_to_add)\n    mini_immuno_schema_flexible.save()\n    assert mini_immuno_schema_flexible.hash != orig_hash\n    assert mini_immuno_schema_flexible.n_members == 7\n    assert ccaplog.text.count(warning_message) == 1\n\n    # remove the feature again\n    mini_immuno_schema_flexible.features.remove(feature_to_add)\n    mini_immuno_schema_flexible.save()\n    assert mini_immuno_schema_flexible.hash == orig_hash\n    assert ccaplog.text.count(warning_message) == 2\n    assert mini_immuno_schema_flexible.n_members == 6\n    feature_to_add.delete(permanent=True)\n\n    # change is flexible (an auxiliary field) 
--------------------------------\n\n    assert mini_immuno_schema_flexible.flexible\n    mini_immuno_schema_flexible.flexible = False\n    mini_immuno_schema_flexible.save()\n    assert mini_immuno_schema_flexible.hash != orig_hash\n    assert ccaplog.text.count(warning_message) == 3\n\n    # restore original setting\n    mini_immuno_schema_flexible.flexible = True\n    mini_immuno_schema_flexible.save()\n    assert mini_immuno_schema_flexible.hash == orig_hash\n    assert ccaplog.text.count(warning_message) == 4\n\n    # change coerce (formerly auxiliary field, now Django field) --------------------------------\n\n    assert not mini_immuno_schema_flexible.coerce\n    mini_immuno_schema_flexible.coerce = True\n    mini_immuno_schema_flexible.save()\n    assert mini_immuno_schema_flexible.hash != orig_hash\n    assert ccaplog.text.count(warning_message) == 5\n\n    # restore original setting\n    mini_immuno_schema_flexible.coerce = False\n    mini_immuno_schema_flexible.save()\n    assert mini_immuno_schema_flexible.hash == orig_hash\n    assert ccaplog.text.count(warning_message) == 6\n\n    # add an index --------------------------------\n\n    index_feature = ln.Feature(name=\"immuno_sample\", dtype=str).save()\n    mini_immuno_schema_flexible.index = index_feature\n    mini_immuno_schema_flexible.save()\n    assert mini_immuno_schema_flexible.hash != orig_hash\n    assert mini_immuno_schema_flexible.n_members == 7\n    assert ccaplog.text.count(warning_message) == 7\n\n    # remove the index\n    mini_immuno_schema_flexible.index = None\n    mini_immuno_schema_flexible.save()\n    assert mini_immuno_schema_flexible.n_members == 6\n    assert mini_immuno_schema_flexible.hash == orig_hash\n    assert ccaplog.text.count(warning_message) == 8\n    index_feature.delete(permanent=True)\n\n    # make a feature optional --------------------------------\n\n    required_feature = mini_immuno_schema_flexible.features.first()\n    
mini_immuno_schema_flexible.optionals.add(required_feature)\n    mini_immuno_schema_flexible.save()\n    assert mini_immuno_schema_flexible.hash != orig_hash\n    assert ccaplog.text.count(warning_message) == 9\n\n    # make it required again\n    mini_immuno_schema_flexible.optionals.remove(required_feature)\n    mini_immuno_schema_flexible.save()\n    assert mini_immuno_schema_flexible.hash == orig_hash\n    assert ccaplog.text.count(warning_message) == 10\n\n    artifact.delete(permanent=True)\n\n\ndef test_schema_mutations_feature_removal(\n    mini_immuno_schema_flexible: ln.Schema, ccaplog\n):\n    feature1 = ln.Feature.get(name=\"perturbation\")\n    feature2 = ln.Feature.get(name=\"cell_type_by_model\")\n    dummy_artifact = ln.Artifact(\".gitignore\", key=\".gitignore\").save()\n    # define the schema the first time\n    schema = ln.Schema(name=\"My test schema X\", features=[feature1, feature2]).save()\n    assert schema.features.count() == 2\n    dummy_artifact.schema = schema  # pretend artifact was validated with this schema\n    dummy_artifact.save()\n    # define the schema a second time, now without feature1\n    schema1 = ln.Schema(name=\"My test schema X\", features=[feature2]).save()\n    # retrieves same schema because of name equality\n    assert ccaplog.text.count(\"you're removing these features:\") == 1\n    assert (\n        ccaplog.text.count(\"you updated the schema hash and might invalidate datasets\")\n        == 1\n    )\n    assert schema1 == schema\n    assert schema1.features.count() == 1\n    dummy_artifact.delete(permanent=True)\n    schema.delete(permanent=True)\n\n\ndef test_schema_add_remove_optional_features(mini_immuno_schema_flexible: ln.Schema):\n    schema = mini_immuno_schema_flexible\n    initial_hash = schema.hash\n    feature_project = ln.Feature(name=\"project\", dtype=ln.Project).save()\n    schema.add_optional_features([feature_project])\n    assert schema.hash != initial_hash\n    schema.remove_optional_features([feature_project])\n   
 assert schema.hash == initial_hash\n\n\ndef test_schema_components(mini_immuno_schema_flexible: ln.Schema):\n    obs_schema = mini_immuno_schema_flexible\n    var_schema = ln.Schema(\n        name=\"scRNA_seq_var_schema\",\n        itype=bt.Gene.ensembl_gene_id,\n        dtype=\"num\",\n    ).save()\n\n    # test recreation of schema based on name lookup\n    var_schema2 = ln.Schema(\n        name=\"scRNA_seq_var_schema\",\n        itype=bt.Gene.ensembl_gene_id,\n        dtype=\"num\",\n    ).save()\n    assert var_schema == var_schema2\n\n    with pytest.raises(InvalidArgument) as error:\n        ln.Schema(\n            name=\"mini_immuno_anndata_schema\",\n            slots={\"obs\": obs_schema, \"var\": var_schema},\n        ).save()\n    assert str(error.value) == \"Please pass otype != None for composite schemas\"\n\n    anndata_schema = ln.Schema(\n        name=\"mini_immuno_anndata_schema\",\n        otype=\"AnnData\",\n        slots={\"obs\": obs_schema, \"var\": var_schema},\n    ).save()\n\n    var_schema2 = ln.Schema(\n        name=\"symbol_var_schema\",\n        itype=bt.Gene.symbol,\n        dtype=\"num\",\n    ).save()\n    # try adding another schema under slot \"var\"\n    # we want to trigger the unique constraint on slot\n    with pytest.raises(IntegrityError) as error:\n        anndata_schema.components.add(  # type: ignore\n            var_schema2, through_defaults={\"slot\": \"var\"}\n        )\n    assert \"unique\" in str(error.value).lower()\n\n    anndata_schema.delete(permanent=True)\n    var_schema2.delete(permanent=True)\n    var_schema.delete(permanent=True)\n\n\ndef test_mini_immuno_schema_flexible(mini_immuno_schema_flexible):\n    schema = ln.Schema(\n        name=\"Mini immuno schema\",\n        features=[\n            ln.Feature.get(name=\"perturbation\"),\n            ln.Feature.get(name=\"cell_type_by_model\"),\n            ln.Feature.get(name=\"assay_oid\"),\n            ln.Feature.get(name=\"donor\"),\n            
ln.Feature.get(name=\"concentration\"),\n            ln.Feature.get(name=\"treatment_time_h\"),\n        ],\n        flexible=True,  # _additional_ columns in a dataframe are validated & annotated\n    )\n    assert schema.name == \"Mini immuno schema\"\n    assert schema.itype == \"Feature\"\n    assert (\n        schema._list_for_hashing[:6]\n        == [\n            \"b=Feature\",\n            \"c=True\",\n            \"d=False\",\n            \"e=False\",\n            \"f=True\",\n            \"h=6\",\n            \"j=HASH_OF_FEATURE_UIDS\",  # this last hash is not deterministic in a unit test\n        ][:6]\n    )\n\n\ndef test_schema_recovery_based_on_hash(mini_immuno_schema_flexible: ln.Schema):\n    feature1 = ln.Feature.get(name=\"perturbation\")\n    feature2 = ln.Feature.get(name=\"cell_type_by_model\")\n    schema = ln.Schema(features=[feature1, feature2]).save()\n    schema2 = ln.Schema(features=[feature1, feature2])\n    assert schema == schema2\n    schema.delete()\n    schema2 = ln.Schema(features=[feature1, feature2])\n    assert schema != schema2\n    schema.delete(permanent=True)\n\n\ndef test_schemas_dataframe():\n    # test on the Python level after record creation -- no saving!\n    schema = ln.Schema(name=\"valid_features\", itype=ln.Feature)\n    assert schema.name == \"valid_features\"\n    assert schema.itype == \"Feature\"\n    assert schema._list_for_hashing == [\n        \"b=Feature\",\n        \"c=True\",\n        \"d=False\",\n        \"e=False\",\n    ]\n    assert schema.hash == \"kMi7B_N88uu-YnbTLDU-DA\"\n\n    # test the convenience function\n    schema = ln.examples.schemas.valid_features()\n    assert schema.uid == \"0000000000000000\"\n    assert schema.name == \"valid_features\"\n    assert schema.itype == \"Feature\"\n    assert schema.hash == \"kMi7B_N88uu-YnbTLDU-DA\"\n\n\ndef test_schemas_anndata():\n    # test on the Python level after record creation -- no saving!\n    obs_schema = 
ln.examples.schemas.valid_features()\n    varT_schema = ln.Schema(\n        name=\"valid_ensembl_gene_ids\", itype=bt.Gene.ensembl_gene_id\n    )\n    assert varT_schema._list_for_hashing == [\n        \"a=num\",\n        \"b=bionty.Gene.ensembl_gene_id\",\n        \"c=True\",\n        \"d=False\",\n        \"e=False\",\n    ]\n    assert varT_schema.name == \"valid_ensembl_gene_ids\"\n    assert varT_schema.itype == \"bionty.Gene.ensembl_gene_id\"\n    assert varT_schema.hash == \"1gocc_TJ1RU2bMwDRK-WUA\"\n    schema = ln.Schema(\n        name=\"anndata_ensembl_gene_ids_and_valid_features_in_obs\",\n        otype=\"AnnData\",\n        slots={\"obs\": obs_schema, \"var.T\": varT_schema.save()},\n    )\n    assert schema._list_for_hashing == [\n        \"a=num\",\n        \"c=True\",\n        \"d=False\",\n        \"e=False\",\n        \"l=GPZ-TzvKRhdC1PQAhlFiow\",\n    ]\n    assert schema.name == \"anndata_ensembl_gene_ids_and_valid_features_in_obs\"\n    assert schema.itype is None\n    assert schema.hash == \"aqGWHvyY49W_PHELUMiBMw\"\n\n    # test the convenience function\n    schema = ln.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs()\n    assert schema.uid == \"0000000000000002\"\n    assert schema.name == \"anndata_ensembl_gene_ids_and_valid_features_in_obs\"\n    assert schema.itype is None\n    assert schema.hash == \"aqGWHvyY49W_PHELUMiBMw\"\n    varT_schema = schema.slots[\"var.T\"]\n    assert varT_schema.uid == \"0000000000000001\"\n    assert varT_schema.name == \"valid_ensembl_gene_ids\"\n    assert varT_schema.itype == \"bionty.Gene.ensembl_gene_id\"\n    assert varT_schema.hash == \"1gocc_TJ1RU2bMwDRK-WUA\"\n\n    schema.delete(permanent=True)\n\n\ndef test_schema_already_saved_aux():\n    \"\"\"When attempting to save a Schema that was already saved before which populated `_aux` fields,\n    we expect the Schema to be returned with the same `_aux` fields.\n\n    Test for https://github.com/laminlabs/lamindb/issues/2887\n    
\"\"\"\n    var_schema = ln.Schema(\n        name=\"test var\",\n        index=ln.Feature(\n            name=\"var_index\",\n            dtype=bt.Gene.ensembl_gene_id,\n            cat_filters={\n                \"source\": bt.Source.get(\n                    entity=\"bionty.Gene\", currently_used=True, organism=\"human\"\n                )\n            },\n        ).save(),\n        itype=ln.Feature,\n        dtype=\"DataFrame\",\n        minimal_set=True,\n        coerce=True,\n    ).save()\n\n    schema = ln.Schema(\n        name=\"AnnData schema\",\n        otype=\"AnnData\",\n        minimal_set=True,\n        coerce=True,\n        slots={\"var\": var_schema},\n    ).save()\n\n    # _aux[\"af\"] now only contains key \"3\" (index_feature_uid) since coerce and flexible are Django fields\n    assert len(schema.slots[\"var\"]._aux[\"af\"].keys()) == 1\n    assert \"3\" in schema.slots[\"var\"]._aux[\"af\"]  # index_feature_uid\n    # coerce and flexible are now proper Django fields\n    assert schema.slots[\"var\"].coerce is True\n    assert schema.slots[\"var\"].flexible is False\n\n    # Attempting to save the same schema again should return the Schema with the same fields\n    var_schema_2 = ln.Schema(\n        name=\"test var\",\n        index=ln.Feature(\n            name=\"var_index\",\n            dtype=bt.Gene.ensembl_gene_id,\n            cat_filters={\n                \"source\": bt.Source.get(\n                    entity=\"bionty.Gene\", currently_used=True, organism=\"human\"\n                )\n            },\n        ).save(),\n        itype=ln.Feature,\n        dtype=\"DataFrame\",\n        minimal_set=True,\n        coerce=True,\n    ).save()\n\n    schema_2 = ln.Schema(\n        name=\"AnnData schema\",\n        otype=\"AnnData\",\n        minimal_set=True,\n        coerce=True,\n        slots={\"var\": var_schema_2},\n    ).save()\n\n    assert len(schema.slots[\"var\"]._aux[\"af\"].keys()) == 1\n    assert schema.slots[\"var\"]._aux == 
schema_2.slots[\"var\"]._aux\n    assert schema.slots[\"var\"].coerce == schema_2.slots[\"var\"].coerce\n    assert schema.slots[\"var\"].flexible == schema_2.slots[\"var\"].flexible\n\n    schema_2.delete(permanent=True)\n    schema.delete(permanent=True)\n\n\ndef test_schema_not_saved_describe():\n    schema = ln.Schema(name=\"NotSavedSchema\", is_type=True)\n    with pytest.raises(ValueError) as e:\n        schema.describe()\n    assert \"Schema must be saved before describing\" in str(e.value)\n\n\ndef test_schema_is_type():\n    Sample = ln.Schema(name=\"Sample\", is_type=True).save()\n    assert Sample.hash is None\n    BioSample = ln.Schema(name=\"BioSample\", is_type=True, type=Sample).save()\n    assert BioSample.hash is None\n    assert BioSample.type == Sample\n    assert BioSample.is_type\n    # create a schema without any features or slots or itype or is_type=True\n    with pytest.raises(InvalidArgument) as e:\n        ln.Schema(name=\"TechSample\", type=Sample)\n    assert \"Please pass features or slots or itype or set is_type=True\" in str(e.value)\n    # clean up\n    BioSample.delete(permanent=True)\n    Sample.delete(permanent=True)\n\n\n# see test_component_composite in test_transform.py\ndef test_composite_component():\n    composite = ln.Schema(name=\"composite\", itype=ln.Feature).save()\n    component1 = ln.Schema(name=\"component1\", itype=bt.CellType).save()\n    component2 = ln.Schema(name=\"component2\", itype=bt.CellMarker).save()\n    composite.components.add(component1, through_defaults={\"slot\": \"slot1\"})\n    composite.components.add(component2, through_defaults={\"slot\": \"slot2\"})\n\n    assert len(composite.components.all()) == 2\n    assert composite.links_component.count() == 2\n    assert set(composite.links_component.all().to_list(\"slot\")) == {\"slot1\", \"slot2\"}\n    assert composite.links_component.first().composite == composite\n    assert composite.composites.count() == 0\n    assert 
composite.links_composite.count() == 0\n\n    ln.models.SchemaComponent.filter(composite=composite).delete(permanent=True)\n\n    link = ln.models.SchemaComponent(\n        composite=composite, component=component1, slot=\"var\"\n    ).save()\n    assert link in composite.links_component.all()\n    assert link in component1.links_composite.all()\n    assert link.slot == \"var\"\n\n    composite.delete(permanent=True)\n    component1.delete(permanent=True)\n    component2.delete(permanent=True)\n\n    assert ln.models.SchemaComponent.filter().count() == 0\n\n\ndef test_schema_describe_bracket_names():\n    \"\"\"Feature names with brackets like 'characteristics[organism]' must appear verbatim in describe output.\n\n    Regression test for Rich interpreting '[...]' as markup tags and swallowing bracket content.\n    \"\"\"\n    features = [\n        ln.Feature(name=\"source name\", dtype=\"str\").save(),\n        ln.Feature(name=\"characteristics[organism]\", dtype=\"str\").save(),\n        ln.Feature(name=\"characteristics[disease]\", dtype=\"str\").save(),\n        ln.Feature(name=\"comment[instrument]\", dtype=\"str\").save(),\n    ]\n    schema = ln.Schema(features, name=\"test_brackets\").save()\n    result = schema.describe(return_str=True)\n    assert \"characteristics[organism]\" in result\n    assert \"characteristics[disease]\" in result\n    assert \"comment[instrument]\" in result\n\n    schema.delete(permanent=True)\n    for feature in features:\n        feature.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_search.py",
    "content": "import bionty as bt\nimport lamindb as ln\nimport pytest\n\n\n@pytest.fixture(scope=\"module\")\ndef prepare_cell_type_registry():\n    bt.CellType.filter().delete(permanent=True)\n    records = [\n        {\n            \"ontology_id\": \"CL:0000084\",\n            \"name\": \"T cell\",\n            \"synonyms\": \"T-cell|T-lymphocyte|T lymphocyte\",\n            \"children\": [\"CL:0000798\", \"CL:0002420\", \"CL:0002419\", \"CL:0000789\"],\n        },\n        {\n            \"ontology_id\": \"CL:0000236\",\n            \"name\": \"B cell\",\n            \"synonyms\": \"B-lymphocyte|B lymphocyte|B-cell\",\n            \"children\": [\"CL:0009114\", \"CL:0001201\"],\n        },\n        {\n            \"ontology_id\": \"CL:0000696\",\n            \"name\": \"PP cell\",\n            \"synonyms\": \"type F enteroendocrine cell\",\n            \"children\": [\"CL:0002680\"],\n        },\n        {\n            \"ontology_id\": \"CL:0002072\",\n            \"name\": \"nodal myocyte\",\n            \"synonyms\": \"P cell|myocytus nodalis|cardiac pacemaker cell\",\n            \"children\": [\"CL:1000409\", \"CL:1000410\"],\n        },\n    ]\n    public_records = []\n    for ref_record in records:\n        record = bt.CellType.from_source(ontology_id=ref_record[\"ontology_id\"])\n        assert record.name == ref_record[\"name\"]\n        assert set(record.synonyms.split(\"|\")) == set(ref_record[\"synonyms\"].split(\"|\"))\n        public_records.append(record)\n    ln.save(public_records)\n    yield \"prepared\"\n    bt.CellType.filter().delete(permanent=True)\n\n\ndef test_search_synonyms(prepare_cell_type_registry):\n    result = bt.CellType.search(\"P cell\").to_dataframe()\n    assert set(result.name.iloc[:2]) == {\"nodal myocyte\", \"PP cell\"}\n\n\ndef test_search_limit(prepare_cell_type_registry):\n    result = bt.CellType.search(\"P cell\", limit=1).to_dataframe()\n    assert len(result) == 1\n\n\ndef 
test_search_case_sensitive(prepare_cell_type_registry):\n    result = bt.CellType.search(\"b cell\", case_sensitive=False).to_dataframe()\n    assert result.name.iloc[0] == \"B cell\"\n\n\ndef test_search_None():\n    with pytest.raises(\n        ValueError, match=\"Cannot search for None value! Please pass a valid string.\"\n    ):\n        bt.CellType.search(None)\n"
  },
  {
    "path": "tests/core/test_settings.py",
    "content": "import lamindb as ln\nimport pytest\n\n\ndef test_settings_repr():\n    repr_str = repr(ln.settings)\n\n    lines = repr_str.split(\"\\n\")\n    assert \"Settings\" in lines[0]\n    assert all(line.startswith(\"  \") for line in lines[1:])\n\n    content = \"\\n\".join(lines[1:])\n    assert content.find(\"instance:\") < content.find(\"storage:\")\n    assert content.find(\"storage:\") < content.find(\"verbosity:\")\n    assert content.find(\"verbosity:\") < content.find(\"track_run_inputs:\")\n\n\ndef test_storage_setter_raises_on_foreign_managed_storage(tmp_path):\n    storage = ln.Storage(root=(tmp_path / \"foreign-managed-storage\").as_posix()).save()\n    storage.instance_uid = \"_not_exists_\"\n    storage.save()\n\n    with pytest.raises(ValueError) as error:\n        ln.settings.storage = storage.root\n    assert (\n        error.exconly()\n        == f\"ValueError: Storage '{storage.root}' exists in another instance (_not_exists_), cannot write to it from here.\"\n    )\n    storage.delete()\n\n\ndef test_local_storage_setter_raises_on_foreign_managed_storage(tmp_path):\n    storage = ln.Storage(\n        root=(tmp_path / \"foreign-managed-local-storage\").as_posix()\n    ).save()\n    storage.instance_uid = \"_not_exists_\"\n    storage.save()\n\n    with pytest.raises(ValueError) as error:\n        ln.settings.local_storage = storage.root\n    assert (\n        error.exconly()\n        == f\"ValueError: Storage '{storage.root}' exists in another instance (_not_exists_), cannot write to it from here.\"\n    )\n    storage.delete()\n"
  },
  {
    "path": "tests/core/test_sqlrecord.py",
    "content": "import re\nimport shutil\nimport textwrap\nfrom pathlib import Path\n\nimport bionty as bt\nimport lamindb as ln\nimport pandas as pd\nimport pytest\nfrom lamindb.errors import FieldValidationError\nfrom lamindb.models.sqlrecord import (\n    _get_record_kwargs,\n    _search,\n    get_name_field,\n    suggest_records_with_similar_names,\n)\n\n\ndef test_feature_describe():\n    description = textwrap.dedent(\"\"\"\\\n    Feature\n      Simple fields\n        .uid: CharField\n        .name: CharField\n        .unit: CharField\n        .description: TextField\n        .array_rank: SmallIntegerField\n        .array_size: IntegerField\n        .array_shape: JSONField\n        .synonyms: TextField\n        .default_value: JSONField\n        .nullable: BooleanField\n        .coerce: BooleanField\n        .is_type: BooleanField\n        .is_locked: BooleanField\n        .created_at: DateTimeField\n        .updated_at: DateTimeField\n      Relational fields\n        .branch: Branch\n        .created_on: Branch\n        .space: Space\n        .created_by: User\n        .run: Run\n        .type: Feature\n        .schemas: Schema\n        .features: Feature\n        .values: JsonValue\n        .projects: Project\n        .ablocks: FeatureBlock\n    \"\"\").strip()\n    assert description == ln.Feature.describe(return_str=True)\n\n\ndef test_artifact_describe():\n    description = textwrap.dedent(\"\"\"\\\n    Artifact\n      Simple fields\n        .uid: CharField\n        .key: CharField\n        .description: TextField\n        .suffix: CharField\n        .kind: CharField\n        .otype: CharField\n        .size: BigIntegerField\n        .hash: CharField\n        .n_files: BigIntegerField\n        .n_observations: BigIntegerField\n        .version_tag: CharField\n        .is_latest: BooleanField\n        .is_locked: BooleanField\n        .created_at: DateTimeField\n        .updated_at: DateTimeField\n      Relational fields\n        .branch: Branch\n        
.created_on: Branch\n        .space: Space\n        .storage: Storage\n        .run: Run\n        .schema: Schema\n        .created_by: User\n        .input_of_runs: Run\n        .recreating_runs: Run\n        .schemas: Schema\n        .json_values: JsonValue\n        .artifacts: Artifact\n        .linked_in_records: Record\n        .users: User\n        .runs: Run\n        .linked_by_runs: Run\n        .ulabels: ULabel\n        .linked_by_artifacts: Artifact\n        .collections: Collection\n        .records: Record\n        .references: Reference\n        .projects: Project\n        .ablocks: ArtifactBlock\n      Bionty fields\n        .organisms: bionty.Organism\n        .genes: bionty.Gene\n        .proteins: bionty.Protein\n        .cell_markers: bionty.CellMarker\n        .tissues: bionty.Tissue\n        .cell_types: bionty.CellType\n        .diseases: bionty.Disease\n        .cell_lines: bionty.CellLine\n        .phenotypes: bionty.Phenotype\n        .pathways: bionty.Pathway\n        .experimental_factors: bionty.ExperimentalFactor\n        .developmental_stages: bionty.DevelopmentalStage\n        .ethnicities: bionty.Ethnicity\n    \"\"\").strip()\n    assert description == ln.Artifact.describe(return_str=True)\n\n\ndef test_repr_describe():\n    user = ln.User.filter().first()\n    assert user.__repr__().startswith(\"User\")\n    assert user.describe(return_str=True).startswith(\"User\")\n\n\ndef test_record_describe_includes_features():\n    record = ln.Record(name=\"describe record\").save()\n    feature = ln.Feature(name=\"describe_metric\", dtype=float).save()\n    record.features.add_values({\"describe_metric\": 1.23})\n\n    output = record.describe(return_str=True)\n    assert \"Features\" in output\n    assert \"describe_metric\" in output\n    assert \"1.23\" in output\n\n    record.delete(permanent=True)\n    feature.delete(permanent=True)\n\n\ndef test_validate_literal_fields():\n    # validate literal\n    with 
pytest.raises(FieldValidationError):\n        ln.Transform(key=\"new-name-not-existing-123\", kind=\"invalid\")\n\n\ndef test_init_with_args():\n    with pytest.raises(\n        FieldValidationError,\n        match=re.escape(\n            \"Use keyword arguments instead of positional arguments, e.g.: User(name='...')\"\n        )\n        + r\".*\",\n    ):\n        # can't use Record here because it raises \"Only one non-keyword arg allowed\"\n        ln.User(\"an arg\")\n\n\ndef test_validate_required_fields():\n    # ULabel has a required name\n    with pytest.raises(FieldValidationError):\n        ln.ULabel()\n    # ULabel has a required name\n    with pytest.raises(FieldValidationError):\n        ln.ULabel(description=\"test\")\n\n\n@pytest.fixture\ndef get_search_test_filepaths():\n    Path(\"unregistered_storage/\").mkdir(exist_ok=True)\n    filepaths = [Path(f\"./unregistered_storage/test-search{i}.txt\") for i in range(6)]\n    for filepath in filepaths:\n        filepath.write_text(filepath.name)\n    yield None\n    shutil.rmtree(\"unregistered_storage/\")\n\n\ndef test_search_and_get(get_search_test_filepaths):\n    artifact1 = ln.Artifact(\n        \"./unregistered_storage/test-search1.txt\", description=\"nonsense\"\n    )\n    artifact1.save()\n    artifact2 = ln.Artifact(\n        \"./unregistered_storage/test-search2.txt\", description=\"nonsense\"\n    )\n    artifact2.save()\n\n    # on purpose to be search3 to test duplicated search\n    artifact0 = ln.Artifact(\n        \"./unregistered_storage/test-search0.txt\", description=\"test-search3\"\n    )\n    artifact0.save()\n    artifact3 = ln.Artifact(\n        \"./unregistered_storage/test-search3.txt\", description=\"test-search3\"\n    )\n    artifact3.save()\n    artifact4 = ln.Artifact(\n        \"./unregistered_storage/test-search4.txt\", description=\"test-search4\"\n    )\n    artifact4.save()\n\n    result = ln.Artifact.search(\"search3\").to_dataframe()\n    assert 
result.iloc[0].description == \"test-search3\"\n    assert result.iloc[1].description == \"test-search3\"\n\n    # no returning entries if all search results have __ratio__ 0\n    # need a better search string below\n    # assert ln.Artifact.search(\"x\").shape[0] == 0\n\n    artifact5 = ln.Artifact(\n        \"./unregistered_storage/test-search5.txt\", key=\"test-search5.txt\"\n    )\n    artifact5.save()\n    res = ln.Artifact.search(\"search5\").to_dataframe()\n    assert res.iloc[0].key == \"test-search5.txt\"\n\n    res_q = ln.Artifact.search(\"search5\")\n    assert res_q[0].key == \"test-search5.txt\"\n    # queryset returns the same order of results\n    assert res.uid.tolist() == [i.uid for i in res_q]\n\n    # multi-field search\n    res = ln.Artifact.search(\n        \"txt\", field=[\"key\", \"description\", \"suffix\"]\n    ).to_dataframe()\n    assert res.iloc[0].suffix == \".txt\"\n\n    # get\n\n    artifact = ln.Artifact.get(description=\"test-search4\")\n    assert artifact == artifact4\n\n    with pytest.raises(ln.errors.ObjectDoesNotExist):\n        ln.Artifact.get(description=\"test-does-not-exist\")\n\n    artifact0.delete(permanent=True, storage=True)\n    artifact1.delete(permanent=True, storage=True)\n    artifact2.delete(permanent=True, storage=True)\n    artifact3.delete(permanent=True, storage=True)\n    artifact4.delete(permanent=True, storage=True)\n    artifact5.delete(permanent=True, storage=True)\n\n\ndef test_suggest_similar_names():\n    record1 = ln.Record(name=\"Test experiment 1\").save()\n    record2 = ln.Record(name=\"Test experiment 2\").save()\n    record3 = ln.Record(name=\"Special test experiment abc\").save()\n    record4 = ln.Record(name=\"A very special test experiment abc\").save()\n\n    assert ln.Record(name=\"Test experiment 1\").uid == record1.uid\n\n    assert suggest_records_with_similar_names(\n        record1, \"name\", {\"name\": \"Test experiment 1\"}\n    )\n    assert not 
suggest_records_with_similar_names(\n        record2, \"name\", {\"name\": \"Test experiment 123\"}\n    )\n\n    queryset = _search(\n        ln.Record,\n        \"Test experiment 123\",\n        field=\"name\",\n        truncate_string=True,\n        limit=3,\n    )\n    assert queryset.count() == 3\n\n    queryset = _search(\n        ln.Record,\n        \"Special test experiment abc\",\n        field=\"name\",\n        truncate_string=True,\n        limit=3,\n    )\n    assert queryset.count() == 2\n    assert queryset[0].name == \"Special test experiment abc\"\n\n    record1.delete(permanent=True)\n    record2.delete(permanent=True)\n    record3.delete(permanent=True)\n    record4.delete(permanent=True)\n\n\ndef test_pass_version():\n    # creating a new transform on key retrieves the same transform\n    # for as long as no source_code was saved\n    transform = ln.Transform(key=\"mytransform\", version=\"1\").save()\n    assert transform.version_tag == \"1\"\n    assert transform.version == \"1\"\n    assert ln.Transform(key=\"mytransform\", version=\"1\") == transform\n    # in case source code is saved\n    transform.source_code = \"dummy\"\n    transform.save()\n    with pytest.raises(ValueError) as e:\n        ln.Transform(key=\"mytransform\", version=\"1\")\n    assert (\n        e.exconly()\n        == \"ValueError: Please change the version tag or leave it `None`, '1' is already taken\"\n    )\n\n\ndef test_delete():\n    record = ln.Record(name=\"test-delete\")\n    # record not yet saved, delete has no effect\n    result = record.delete()\n    assert result is None\n    assert record.branch_id == 1\n    record.save()\n    result = record.delete()\n    assert result is None\n    assert record.branch_id == -1\n    result = record.delete(permanent=True)\n    assert isinstance(result, tuple)\n    assert len(result) == 2\n    deleted_count, deleted_dict = result\n    assert deleted_count == 1\n    assert isinstance(deleted_dict, dict)\n    assert 
ln.Record.filter(name=\"test-delete\").exists() is False\n\n\ndef test_get_name_field():\n    transform = ln.Transform(key=\"test\").save()\n    assert get_name_field(ln.Run(transform)) == \"started_at\"\n    with pytest.raises(ValueError):\n        get_name_field(ln.Artifact.records.through())\n    transform.delete(permanent=True)\n\n\ndef test_using():\n    # the two below calls error if the records aren't found\n    ln.Artifact.connect(\"laminlabs/lamin-site-assets\").get(1)\n    ln.Artifact.connect(\"laminlabs/lamin-site-assets\").get(uid=\"MqEaGU7fXvxNy61R0000\")\n    # cross-database query\n    hemangioblast = bt.CellType.from_source(name=\"hemangioblast\").save()\n    artifact = (\n        ln.Artifact.connect(\"laminlabs/lamin-dev\")\n        .filter(cell_types=hemangioblast)\n        .first()\n    )\n    assert artifact is not None\n    hemangioblast_dev = artifact.cell_types.get(name=\"hemangioblast\")\n    assert hemangioblast_dev.uid == hemangioblast.uid\n    assert hemangioblast_dev.id != hemangioblast.id\n    # query via list\n    artifact_ref = (\n        ln.Artifact.connect(\"laminlabs/lamin-dev\")\n        .filter(cell_types__in=[hemangioblast])\n        .first()\n    )\n    assert artifact == artifact_ref\n    # check that .using provided with the current instance does nothing\n    assert ln.User.connect(\"lamindb-unit-tests-core\").first()._state.db == \"default\"\n    user = ln.setup.settings.user.handle\n    assert (\n        ln.User.connect(f\"{user}/lamindb-unit-tests-core\").first()._state.db\n        == \"default\"\n    )\n\n\ndef test_get_record_kwargs():\n    assert _get_record_kwargs(ln.Feature) == [\n        (\"name\", \"str\"),\n        (\"dtype\", \"DtypeStr | ULabel | Record | Registry | list[Registry] | FieldAttr\"),\n        (\"type\", \"Feature | None\"),\n        (\"is_type\", \"bool\"),\n        (\"unit\", \"str | None\"),\n        (\"description\", \"str | None\"),\n        (\"synonyms\", \"str | None\"),\n        (\"nullable\", 
\"bool | None\"),\n        (\n            \"default_value\",\n            \"Any | None\",\n        ),\n        (\"coerce\", \"bool | None\"),\n        (\n            \"cat_filters\",\n            \"dict[str\",\n        ),\n    ]\n\n\ndef test_get_record_kwargs_empty():\n    class EmptySQLRecord:\n        pass\n\n    assert _get_record_kwargs(EmptySQLRecord) == []\n\n    class NoInitSQLRecord:\n        def method(self):\n            pass\n\n    assert _get_record_kwargs(NoInitSQLRecord) == []\n\n\ndef test_soft_delete_error():\n    with pytest.raises(ValueError):\n        ln.Storage.filter().first().delete(permanent=False)\n\n    with pytest.raises(ValueError):\n        ln.Branch.filter().first().delete(permanent=False)\n\n\ndef test_delete_return_value_permanent():\n    \"\"\"Test that permanent delete returns Django's natural return value.\"\"\"\n    # Test with ULabel (simple SQLRecord)\n    ulabel = ln.ULabel(name=\"test-delete-return\").save()\n    result = ulabel.delete(permanent=True)\n    assert isinstance(result, tuple)\n    assert len(result) == 2\n    deleted_count, deleted_dict = result\n    assert deleted_count == 1\n    assert isinstance(deleted_dict, dict)\n    assert len(deleted_dict) > 0\n    # Check that the registry name is in the dict\n    # Django returns app_label.ClassName format\n    registry_name = f\"{ulabel._meta.app_label}.{ulabel.__class__.__name__}\"\n    assert registry_name in deleted_dict\n    assert deleted_dict[registry_name] == 1\n\n\ndef test_unsaved_relationship_modification_attempts():\n    af = ln.Artifact.from_dataframe(\n        pd.DataFrame({\"col1\": [1, 2, 3], \"col2\": [4, 5, 6]}), description=\"testme\"\n    )\n\n    new_label = ln.Record(name=\"testlabel\").save()\n    with pytest.raises(ValueError) as excinfo:\n        af.records.add(new_label)\n\n    assert (\n        str(excinfo.value)\n        == \"You are trying to access the many-to-many relationships of an unsaved Artifact object. 
Please save it first using '.save()'.\"\n    )\n\n    new_label.delete(permanent=True)\n    af.delete(permanent=True)\n\n\ndef test_failed_connect():\n    with pytest.raises(ln.setup.errors.InstanceNotFoundError) as error:\n        ln.Artifact.connect(\"laminlabs/lamindata-not-existing\")\n    assert error.exconly().startswith(\n        \"lamindb_setup.errors.InstanceNotFoundError: 'laminlabs/lamindata-not-existing' not found: 'instance-not-found'\"\n    )\n\n\ndef test_unsaved_model_different_instance():\n    af = ln.Artifact.connect(\"laminlabs/lamindata\").get(\n        key=\"scrna/micro-macfarland2020.h5ad\"\n    )\n\n    new_label = ln.Record(name=\"testlabel\").save()\n    with pytest.raises(ValueError) as excinfo:\n        af.records.add(new_label)\n\n    assert (\n        str(excinfo.value)\n        == \"Cannot label a record from instance 'laminlabs/lamindata'. \"\n        \"Please save the record first to your instance using '.save()'.\"\n    )\n\n    new_label.delete(permanent=True)\n\n\ndef test_track_fields_with_deferred_columns(example_dataframe: pd.DataFrame):\n    artifact = ln.Artifact.from_dataframe(\n        example_dataframe, key=\"deferred-track-fields.parquet\"\n    ).save()\n\n    # loading a tracked field as deferred should not crash in __init__\n    deferred_artifact = ln.Artifact.filter(id=artifact.id).only(\"id\").one()\n    assert deferred_artifact.id == artifact.id\n    assert not deferred_artifact._field_changed(\"space_id\")\n\n    artifact.delete(permanent=True)\n\n\ndef test_track_fields_must_exist_on_model(monkeypatch, example_dataframe: pd.DataFrame):\n    artifact = ln.Artifact.from_dataframe(\n        example_dataframe, key=\"invalid-track-field.parquet\"\n    ).save()\n\n    monkeypatch.setattr(ln.Artifact, \"_TRACK_FIELDS\", (\"space_id\", \"not_a_real_field\"))\n\n    with pytest.raises(\n        FieldValidationError,\n        match=\"_TRACK_FIELDS contains invalid field for Artifact: not_a_real_field\",\n    ):\n        
ln.Artifact.get(artifact.id)\n\n    artifact.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_storage.py",
    "content": "import concurrent.futures\n\nimport lamindb as ln\n\n\n# we need this test both in the core and the storage/cloud tests\n# because the internal logic that retrieves information about other instances\n# depends on whether the current instance is managed on the hub\ndef test_reference_storage_location(ccaplog):\n    ln.Artifact(\"s3://lamindata/iris_studies/study0_raw_images\")\n    assert ln.Storage.get(root=\"s3://lamindata\").instance_uid == \"4XIuR0tvaiXM\"\n    assert (\n        \"referenced read-only storage location at s3://lamindata, is managed by instance with uid 4XIuR0tvaiXM\"\n        in ccaplog.text\n    )\n\n\ndef test_create_storage_locations_parallel():\n    root: str = \"nonregistered_storage\"\n\n    def create_storage() -> str:\n        ln.Storage(root=root).save()  # type: ignore\n        return root\n\n    n_parallel = 3\n    with concurrent.futures.ThreadPoolExecutor(max_workers=n_parallel) as executor:\n        futures = [executor.submit(create_storage) for i in range(n_parallel)]\n        _ = [future.result() for future in concurrent.futures.as_completed(futures)]\n\n    storage = ln.Storage.get(root__endswith=root)\n    storage.delete()\n"
  },
  {
    "path": "tests/core/test_switch.py",
    "content": "\"\"\"Tests for ln.setup.switch.\"\"\"\n\nimport lamindb as ln\nimport pytest\n\n\ndef test_switch_create_existing_branch_raises():\n    \"\"\"Switch with create=True and existing branch raises BranchAlreadyExists with hint.\"\"\"\n    with pytest.raises(ln.errors.BranchAlreadyExists) as exc_info:\n        ln.setup.switch(\"main\", create=True)\n    msg = str(exc_info.value)\n    assert \"already exists\" in msg\n    assert \"-c/--create\" in msg or \"Omit\" in msg\n"
  },
  {
    "path": "tests/core/test_track_flow.py",
    "content": "import time\nfrom pathlib import Path\nfrom typing import Iterable\n\nimport lamindb as ln\nimport pandas as pd\nimport pytest\nfrom lamindb.errors import InvalidArgument\n\n\n@ln.flow(global_run=\"clear\")\ndef process_chunk(\n    chunk_id: int, artifact_param: ln.Artifact, records_params: Iterable[ln.Record]\n) -> str:\n    # Create a simple DataFrame\n    df = pd.DataFrame(\n        {\"id\": range(chunk_id * 10, (chunk_id + 1) * 10), \"value\": range(10)}\n    )\n    env_file = Path(\"file_with_same_hash.txt\")\n    env_file.write_text(\"1\")\n    ln.Artifact(env_file, description=\"file_with_same_hash\").save()\n    # Save it as an artifact\n    key = f\"chunk_{chunk_id}.parquet\"\n    artifact = ln.Artifact.from_dataframe(df, key=key).save()\n    assert ln.context.run is not None\n    return artifact.key\n\n\ndef test_flow():\n    param_artifact = ln.Artifact(\".gitignore\", key=\"param_artifact\").save()\n    ln.Record(name=\"record1\").save(), ln.Record(name=\"record2\").save()\n    records_params = ln.Record.filter(name__startswith=\"record\")\n\n    assert ln.context.run is None\n    artifact_key = process_chunk(1, param_artifact, records_params)\n    assert ln.context.run is None\n\n    # Verify the artifacts and runs\n    artifacts = [ln.Artifact.get(key=key) for key in [artifact_key]]\n    same_hash_artifacts = ln.Artifact.filter(description=\"file_with_same_hash\")\n\n    runs = [artifact.run for artifact in artifacts]\n\n    # Verify each run has the correct start and finish times\n    for run in runs:\n        print(f\"Run details: {run}\")\n        assert run.started_at is not None\n        assert run.finished_at is not None\n        assert run.started_at < run.finished_at\n        assert run.status == \"completed\"\n        assert isinstance(run.params[\"chunk_id\"], int)\n        assert run.params[\"artifact_param\"].startswith(\n            f\"Artifact[{param_artifact.uid}]\"\n        )\n        assert 
run.params[\"records_params\"] == [\n            f\"Record[{record.uid}]\" for record in records_params\n        ]\n\n    # test error behavior\n    with pytest.raises(RuntimeError) as error:\n        ln.context._run = run\n        process_chunk(1, param_artifact, records_params)\n        ln.context._run = None\n    assert str(error.exconly()).startswith(\n        \"RuntimeError: Please use @ln.step() or clear the global run context before using @ln.flow(): no `ln.track()` or `@ln.flow(global_run='clear')`\"\n    )\n\n    # Clean up test artifacts\n    runs = []\n    for artifact in artifacts:\n        runs.append(artifact.run)\n        artifact.delete(permanent=True)\n    param_artifact.delete(permanent=True)\n    same_hash_artifacts[0].delete(permanent=True)\n    Path(\"file_with_same_hash.txt\").unlink()\n    for run in runs:\n        run.delete(permanent=True)\n    ln.context._run = None\n\n\ndef test_flow_track_arg_aliases_implicit():\n    unique = time.time_ns()\n    missing_project = f\"missing-flow-project-{unique}\"\n\n    @ln.flow(global_run=\"clear\")\n    def flow_with_implicit_project_alias(project: str) -> None:\n        pass\n\n    with pytest.raises(InvalidArgument) as error:\n        flow_with_implicit_project_alias(project=missing_project)\n    assert error.exconly().startswith(\n        f\"lamindb.errors.InvalidArgument: Project '{missing_project}' not found\"\n    )\n\n\ndef test_flow_track_arg_aliases_false():\n    unique = time.time_ns()\n    missing_project = f\"missing-flow-project-{unique}\"\n\n    @ln.flow(global_run=\"clear\", track_arg_aliases=False)\n    def flow_without_project_alias(project: str) -> str:\n        assert ln.context.run is not None\n        return ln.context.run.uid\n\n    run = None\n    try:\n        run_uid = flow_without_project_alias(project=missing_project)\n        run = ln.Run.get(uid=run_uid)\n        assert run.params[\"project\"] == missing_project\n    finally:\n        ln.context._run = None\n        if run 
is not None:\n            run.delete(permanent=True)\n            run.transform.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_track_script_or_notebook.py",
    "content": "import signal\nimport subprocess\nimport sys\nimport time\nfrom pathlib import Path\nfrom unittest.mock import MagicMock, patch\n\nimport lamindb as ln\nimport lamindb_setup as ln_setup\nimport pytest\nfrom lamindb._finish import clean_r_notebook_html, get_shortcut\nfrom lamindb._secret_redaction import redact_secrets_in_source_code\nfrom lamindb.core._context import (\n    REDACTED_SECRET_VALUE,\n    LogStreamTracker,\n    context,\n    detect_and_process_source_code_file,\n    serialize_params_to_json,\n)\nfrom lamindb.errors import InvalidArgument, TrackNotCalled, ValidationError\nfrom lamindb_setup.core.upath import UPath\n\nSCRIPTS_DIR = Path(__file__).parent.resolve() / \"scripts\"\nNOTEBOOKS_DIR = Path(__file__).parent.resolve() / \"notebooks\"\n\n\ndef test_serialize_params_to_json():\n    a_path = Path(\"/some/local/folder\")\n    a_upath = UPath(\"s3://bucket/key\")\n    params = {\n        \"path_key\": a_path,\n        \"none_key\": None,\n        \"empty_list_key\": [],\n        \"list_str_key\": [\"string\"],\n        \"upath_key\": a_upath,\n        \"str_key\": \"plain\",\n        \"api_key\": \"test-api-key-value\",\n        \"openAIApiKey\": \"another-secret\",\n        \"database_url\": \"postgresql://db_user:db_password@db.example.com:5432/mydb\",\n    }\n    result = serialize_params_to_json(params)\n    # None is omitted\n    assert \"none_key\" not in result\n    # Empty list is omitted (same as None)\n    assert \"empty_list_key\" not in result\n    # Path is serialized to posix string\n    assert result[\"path_key\"] == \"/some/local/folder\"\n    # UPath is serialized to posix string\n    assert result[\"upath_key\"] == \"s3://bucket/key\"\n    # List of strings is JSON-serialized as-is (list[cat ? 
str])\n    assert result[\"list_str_key\"] == [\"string\"]\n    # Other values unchanged\n    assert result[\"str_key\"] == \"plain\"\n    assert result[\"api_key\"] == REDACTED_SECRET_VALUE\n    assert result[\"openAIApiKey\"] == REDACTED_SECRET_VALUE\n    assert result[\"database_url\"] == REDACTED_SECRET_VALUE\n    assert set(result.keys()) == {\n        \"path_key\",\n        \"upath_key\",\n        \"str_key\",\n        \"list_str_key\",\n        \"api_key\",\n        \"openAIApiKey\",\n        \"database_url\",\n    }\n\n\ndef test_redact_secrets_in_source_code():\n    source_code = \"\"\"\napi_key = \"test-api-key-value\"\nopenAIApiKey = \"another-secret\"\nuid = \"a6yhtobqTjQM6q8t\"\ndb_url = \"postgresql://db_user:db_password@db.example.com:5432/mydb\"\nos.environ[\"API_KEY\"] = \"sdk-key\"\nconfig = {\"client_secret\": \"client-secret-value\", \"id\": \"abc123\"}\n\"\"\"\n    redacted, redaction_count = redact_secrets_in_source_code(source_code)\n    assert redaction_count == 5\n    assert 'api_key = \"***REDACTED***\"' in redacted\n    assert 'openAIApiKey = \"***REDACTED***\"' in redacted\n    assert 'db_url = \"***REDACTED***\"' in redacted\n    assert 'os.environ[\"API_KEY\"] = \"***REDACTED***\"' in redacted\n    assert '\"client_secret\": \"***REDACTED***\"' in redacted\n    assert 'uid = \"a6yhtobqTjQM6q8t\"' in redacted\n\n\ndef test_redact_secrets_in_source_code_keeps_env_references():\n    source_code = \"\"\"\napi_key = os.getenv(\"OPENAI_API_KEY\")\nopenAIApiKey = getenv(\"OPENAI_API_KEY\")\nmodel_api_key = os.environ[\"MODEL_API_KEY\"]\nprovider_token = os.environ.get(\"PROVIDER_TOKEN\")\n\"\"\"\n    redacted, redaction_count = redact_secrets_in_source_code(source_code)\n    # Env lookups are references, not embedded literals. 
Keep them for rerunnable source code.\n    assert redaction_count == 0\n    assert 'api_key = os.getenv(\"OPENAI_API_KEY\")' in redacted\n    assert 'openAIApiKey = getenv(\"OPENAI_API_KEY\")' in redacted\n    assert 'model_api_key = os.environ[\"MODEL_API_KEY\"]' in redacted\n    assert 'provider_token = os.environ.get(\"PROVIDER_TOKEN\")' in redacted\n\n\ndef test_redact_secrets_in_source_code_ignores_annotations_and_forwarding():\n    source_code = \"\"\"\ndef run(api_key: str) -> None:\n    raise RuntimeError(\"fail\")\n\nrun_agent(\n    api_key=api_key,\n)\n\"\"\"\n    redacted, redaction_count = redact_secrets_in_source_code(source_code)\n    # Do not treat Python type annotations or argument forwarding as hardcoded secrets.\n    assert redaction_count == 0\n    assert \"def run(api_key: str) -> None:\" in redacted\n    assert \"api_key=api_key,\" in redacted\n\n\ndef test_serialize_params_to_json_redacts_provider_api_key_names():\n    params = {\n        \"LAMIN_API_KEY\": \"lamin-super-secret\",\n        \"OPENAI_API_KEY\": \"openai-super-secret\",\n        \"ANTHROPIC_API_KEY\": \"anthropic-super-secret\",\n        \"GEMINI_API_KEY\": \"gemini-super-secret\",\n        \"provider_name\": \"safe-value\",\n    }\n    result = serialize_params_to_json(params)\n    assert result[\"LAMIN_API_KEY\"] == REDACTED_SECRET_VALUE\n    assert result[\"OPENAI_API_KEY\"] == REDACTED_SECRET_VALUE\n    assert result[\"ANTHROPIC_API_KEY\"] == REDACTED_SECRET_VALUE\n    assert result[\"GEMINI_API_KEY\"] == REDACTED_SECRET_VALUE\n    assert result[\"provider_name\"] == \"safe-value\"\n\n\ndef test_redact_secrets_in_source_code_redacts_provider_api_key_names():\n    source_code = \"\"\"\nLAMIN_API_KEY = \"lamin-super-secret\"\nOPENAI_API_KEY = \"openai-super-secret\"\nANTHROPIC_API_KEY = \"anthropic-super-secret\"\nGEMINI_API_KEY = \"gemini-super-secret\"\nprovider = \"openai\"\n\"\"\"\n    redacted, redaction_count = redact_secrets_in_source_code(source_code)\n    assert 
redaction_count == 4\n    assert 'LAMIN_API_KEY = \"***REDACTED***\"' in redacted\n    assert 'OPENAI_API_KEY = \"***REDACTED***\"' in redacted\n    assert 'ANTHROPIC_API_KEY = \"***REDACTED***\"' in redacted\n    assert 'GEMINI_API_KEY = \"***REDACTED***\"' in redacted\n    assert 'provider = \"openai\"' in redacted\n\n\ndef test_track_basic_invocation():\n    project = \"non-existing project\"\n    with pytest.raises(ln.errors.InvalidArgument) as error:\n        ln.track(project=project)\n    assert (\n        error.exconly()\n        == f\"lamindb.errors.InvalidArgument: Project '{project}' not found, either create it with `ln.Project(name='...').save()` or fix typos.\"\n    )\n    space = \"non-existing space\"\n    with pytest.raises(ln.errors.InvalidArgument) as error:\n        ln.track(space=space)\n    assert (\n        error.exconly()\n        == f\"lamindb.errors.InvalidArgument: Space '{space}', please check on the hub UI whether you have the correct `uid` or `name`.\"\n    )\n\n    test_transform = ln.Transform(key=\"test_transform\").save()\n\n    # first invocation using features\n    kwargs = {\"param1\": 1, \"param2\": \"my-string\", \"param3\": 3.14}\n    with pytest.raises(ValidationError) as exc:\n        ln.track(transform=test_transform, features=kwargs)\n    assert exc.exconly().startswith(\n        \"\"\"lamindb.errors.ValidationError: These keys could not be validated: ['param1', 'param2', 'param3']\"\"\"\n    )\n    feature1 = ln.Feature(name=\"param1\", dtype=int).save()\n    feature2 = ln.Feature(name=\"param2\", dtype=str).save()\n    feature3 = ln.Feature(name=\"param3\", dtype=float).save()\n    feature4 = ln.Feature(name=\"label_param\", dtype=ln.Record).save()\n    record = ln.Record(name=\"my_label\").save()\n    kwargs[\"label_param\"] = \"my_label\"\n    ln.track(transform=test_transform, features=kwargs)\n    assert ln.context.run.features.get_values() == kwargs\n    print(ln.context.run.features.describe(return_str=True))\n    
assert (\n        ln.context.run.features.describe(return_str=True)\n        == f\"\"\"\\\nRun: {ln.context.run.uid[:7]} ({ln.context.run.transform.key})\n└── Features\n    └── label_param         Record                   my_label\n        param1              int                      1\n        param2              str                      my-string\n        param3              float                    3.14\"\"\"\n    )\n    # also call describe() plainly without further checks\n    ln.context.run.describe()\n    # second invocation\n    kwargs = {\"param1\": 1, \"param2\": \"my-string\", \"param3\": 3.14, \"param4\": [1, 2]}\n    param4 = ln.Feature(name=\"param4\", dtype=\"int\").save()\n    with pytest.raises(ValidationError) as exc:\n        ln.track(transform=test_transform, features=kwargs)\n    assert \"Column 'param4' failed dtype check for 'int': got object\" in exc.exconly()\n    # fix param4 dtype\n    param4.delete(permanent=True)\n    param4 = ln.Feature(name=\"param4\", dtype=list[int]).save()\n    # re-run\n    ln.track(transform=test_transform, features=kwargs)\n    assert ln.context.run.features.get_values() == kwargs\n\n    # now use the params arg\n    ln.track(transform=test_transform, params=kwargs)\n    assert ln.context.run.params == kwargs\n    assert ln.Run.filter(params__param1=kwargs[\"param1\"]).count() == 1\n\n    # test that run populates things like records\n    record = ln.Record(name=\"my-label-in-track\")\n    assert record.run == ln.context.run\n\n    # test that we can call ln.finish() also for pipeline-like transforms\n    run = ln.context.run\n    assert run.finished_at is None\n    ln.finish()\n    assert (\n        run.finished_at is not None\n    )  # context is cleared after finish(); use captured run\n\n    # clean up\n    run.delete(permanent=True)\n    ln.models.RunJsonValue.filter(run__transform=test_transform).delete(permanent=True)\n    ln.models.RunRecord.filter(run__transform=test_transform).delete(permanent=True)\n  
  feature1.delete(permanent=True)\n    feature2.delete(permanent=True)\n    feature3.delete(permanent=True)\n    feature4.delete(permanent=True)\n    param4.delete(permanent=True)\n    test_transform.delete(permanent=True)\n\n\ndef test_track_accepts_initiated_by_run_uid():\n    unique = time.time_ns()\n    parent_transform = ln.Transform(key=f\"parent-run-{unique}\").save()\n    child_transform = ln.Transform(key=f\"child-run-{unique}\").save()\n    parent_run = ln.Run(transform=parent_transform).save()\n    try:\n        ln.track(\n            transform=child_transform,\n            initiated_by_run=parent_run.uid,\n            new_run=True,\n        )\n        assert ln.context.run is not None\n        assert ln.context.run.initiated_by_run is not None\n        assert ln.context.run.initiated_by_run.uid == parent_run.uid\n        ln.finish()\n        with pytest.raises(InvalidArgument) as error:\n            ln.track(\n                transform=child_transform,\n                initiated_by_run=\"does-not-exist\",\n                new_run=True,\n            )\n        assert error.exconly().startswith(\n            \"lamindb.errors.InvalidArgument: Run 'does-not-exist' not found\"\n        )\n    finally:\n        ln.context._run = None\n        ln.Run.filter(transform=child_transform).delete(permanent=True)\n        parent_run.delete(permanent=True)\n        child_transform.delete(permanent=True)\n        parent_transform.delete(permanent=True)\n\n\ndef test_track_uses_initiated_by_run_uid_from_env(monkeypatch: pytest.MonkeyPatch):\n    unique = time.time_ns()\n    parent_transform = ln.Transform(key=f\"parent-run-env-{unique}\").save()\n    child_transform = ln.Transform(key=f\"child-run-env-{unique}\").save()\n    parent_run = ln.Run(transform=parent_transform).save()\n    try:\n        monkeypatch.setenv(\"LAMIN_INITIATED_BY_RUN_UID\", parent_run.uid)\n        ln.track(transform=child_transform, new_run=True)\n        assert ln.context.run is not None\n      
  assert ln.context.run.initiated_by_run is not None\n        assert ln.context.run.initiated_by_run.uid == parent_run.uid\n        ln.finish()\n    finally:\n        ln.context._run = None\n        ln.Run.filter(transform=child_transform).delete(permanent=True)\n        parent_run.delete(permanent=True)\n        child_transform.delete(permanent=True)\n        parent_transform.delete(permanent=True)\n\n\n@pytest.mark.parametrize(\"pass_plan_as_key\", [False, True], ids=[\"artifact\", \"key\"])\ndef test_track_with_plan_links_run(tmp_path, pass_plan_as_key):\n    unique = time.time_ns()\n    plan_path = tmp_path / f\"my-agent-plan-{unique}.md\"\n    plan_path.write_text(\"# Agent plan\\n\\n- Step 1\\n\")\n    plan_artifact = ln.Artifact(\n        plan_path,\n        key=f\".plans/my-agent-plan-{unique}.md\",\n        kind=\"plan\",\n    ).save()\n    transform = ln.Transform(key=f\"test-track-with-plan-{unique}\").save()\n    try:\n        plan = plan_artifact.key if pass_plan_as_key else plan_artifact\n        ln.track(transform=transform, plan=plan)\n        run = ln.context.run\n        assert run.plan is not None\n        assert run.plan.uid == plan_artifact.uid\n        run_from_db = ln.Run.get(uid=run.uid)\n        assert run_from_db.plan is not None\n        assert run_from_db.plan.uid == plan_artifact.uid\n        ln.finish()\n    finally:\n        ln.context._run = None\n        ln.Run.filter(transform=transform).delete(permanent=True)\n        plan_artifact.delete(permanent=True)\n        transform.delete(permanent=True)\n\n\n@pytest.fixture\ndef create_record():\n    \"\"\"Factory fixture that returns a function to create records.\"\"\"\n    created_records = []\n\n    def create(kind: str) -> ln.models.SQLRecord:\n        if kind == \"artifact\":\n            record = ln.Artifact(\"README.md\", key=\"README.md\").save()\n        elif kind == \"collection\":\n            a1 = ln.Artifact(\"README.md\", key=\"README.md\").save()\n            
created_records.append(a1)\n            a2 = ln.Artifact(\"pyproject.toml\", key=\"pyproject.toml\").save()\n            created_records.append(a2)\n            record = ln.Collection([a1, a2], key=\"test-collection\").save()\n        created_records.append(record)\n        return record\n\n    yield create\n\n    for record in created_records[::-1]:\n        record.delete(permanent=True)\n\n\n@pytest.mark.parametrize(\"kind\", [\"artifact\", \"collection\"])\ndef test_track_input_record(create_record, kind):\n    # First run\n    ln.track()\n    previous_run = ln.context.run\n    record = create_record(kind)\n    record.cache()\n    assert (\n        record not in getattr(ln.context.run, f\"input_{kind}s\").all()\n    )  # avoid cycle with created artifact\n\n    # Second run\n    ln.track(new_run=True)\n    assert ln.context.run != previous_run\n    record = create_record(kind)\n    assert ln.context.run in record.recreating_runs.all()\n    assert record._subsequent_run_id == ln.context.run.id\n    record.cache()\n    assert (\n        record not in getattr(ln.context.run, f\"input_{kind}s\").all()\n    )  # avoid cycle with re-created artifact\n\n    # Third run\n    ln.track(new_run=True)\n    assert ln.context.run != previous_run\n    if kind == \"artifact\":\n        record = ln.Artifact.get(key=\"README.md\")\n    else:\n        record = ln.Collection.get(key=\"test-collection\")\n    record.cache()\n    assert ln.context.run not in record.recreating_runs.all()\n    assert not hasattr(record, \"_subsequent_run_id\")\n    assert record in getattr(ln.context.run, f\"input_{kind}s\").all()  # regular input\n\n\ndef test_track_notebook_colab():\n    notebook_path = \"/fileId=1KskciVXleoTeS_OGoJasXZJreDU9La_l\"\n    ln.context._track_notebook(path_str=notebook_path)\n\n\ndef test_track_notebook_untitled():\n    notebook_path = \"Untitled.ipynb\"\n    with pytest.raises(RuntimeError) as error:\n        ln.context._track_notebook(path_str=notebook_path)\n    assert 
(\n        \"Your notebook file name is 'Untitled.ipynb', please rename it before tracking. You might have to re-start your notebook kernel.\"\n        in error.exconly()\n    )\n\n\ndef test_detect_and_process_source_code_file_returns_key_from_module_for_package():\n    \"\"\"When path is inferred from stack and caller __name__ has '.', key_from_module is module path.\"\"\"\n    script_path = str(SCRIPTS_DIR / \"script-to-test-versioning.py\")\n    mock_frame = MagicMock()\n    mock_frame.f_globals = {\"__name__\": \"mypackage.mymodule\"}\n    with patch(\"inspect.stack\") as mock_stack:\n        mock_stack.return_value = [\n            MagicMock(),\n            MagicMock(),\n            (\n                mock_frame,\n                script_path,\n                MagicMock(),\n                MagicMock(),\n                MagicMock(),\n                MagicMock(),\n            ),\n        ]\n        path, kind, ref, ref_type, key_from_module = (\n            detect_and_process_source_code_file(path=None)\n        )\n    assert key_from_module == \"pypackages/mypackage/mymodule.py\"\n    assert path == Path(script_path)\n\n\ndef test_detect_and_process_source_code_file_returns_none_key_for_script():\n    \"\"\"When path is inferred from stack and caller __name__ has no '.', key_from_module is None.\"\"\"\n    script_path = str(SCRIPTS_DIR / \"script-to-test-versioning.py\")\n    mock_frame = MagicMock()\n    mock_frame.f_globals = {\"__name__\": \"__main__\"}\n    with patch(\"inspect.stack\") as mock_stack:\n        mock_stack.return_value = [\n            MagicMock(),\n            MagicMock(),\n            (\n                mock_frame,\n                script_path,\n                MagicMock(),\n                MagicMock(),\n                MagicMock(),\n                MagicMock(),\n            ),\n        ]\n        path, kind, ref, ref_type, key_from_module = (\n            detect_and_process_source_code_file(path=None)\n        )\n    assert key_from_module 
is None\n\n\ndef test_finish_before_track():\n    ln.context._run = None\n    with pytest.raises(TrackNotCalled) as error:\n        ln.finish()\n    assert \"Please run `ln.track()` before `ln.finish()\" in error.exconly()\n\n\ndef test_invalid_transform_kind():\n    transform = ln.Transform(key=\"test transform\")\n    ln.track(transform=transform)\n    ln.context._path = None\n    ln.context.run.transform.kind = \"script\"\n    with pytest.raises(ValueError) as error:\n        ln.finish()\n    assert \"Transform type is not allowed to be\" in error.exconly()\n\n    # unset to remove side effects\n    ln.context._run = None\n\n\ndef test_create_or_load_transform():\n    title = \"title\"\n    version = \"2.0\"\n    uid = \"NJvdsWWbJlZS0000\"\n    context.uid = uid\n    context.version = version\n    context._path = Path(\"my-test-transform-create-or-load.py\")\n    context._path.touch(exist_ok=True)\n    context._create_or_load_transform(\n        description=title,\n        transform_kind=\"notebook\",\n    )\n    assert context._transform.uid == uid\n    assert context._transform.version_tag == version\n    assert context._transform.description == title\n    context._create_or_load_transform(\n        description=title,\n    )\n    assert context._transform.uid == uid\n    assert context._transform.version_tag == version\n    assert context._transform.description == title\n\n    # now, test an updated transform name\n    context._create_or_load_transform(\n        description=\"updated title\",\n    )\n    assert context._transform.uid == uid\n    assert context._transform.version_tag == version\n    assert context._transform.description == \"updated title\"\n\n    # unset to remove side effects\n    ln.context._uid = None\n    ln.context._run = None\n    ln.context._transform = None\n    ln.context._path.unlink()\n    ln.context._path = None\n\n\ndef test_create_or_load_transform_warns_when_outside_dev_dir(\n    tmp_path, ccaplog: pytest.LogCaptureFixture\n):\n 
   previous_dev_dir = ln_setup.settings.dev_dir\n    path_outside_dev_dir = tmp_path / f\"outside-{time.time_ns()}.py\"\n    path_outside_dev_dir.write_text(\"print('track test')\\n\")\n    expected_key = path_outside_dev_dir.name\n    transform: ln.Transform | None = None\n    try:\n        ln_setup.settings.dev_dir = tmp_path / \"configured-dev-dir\"\n        ln_setup.settings.dev_dir.mkdir(exist_ok=True)\n        ccaplog.clear()\n        context._path = path_outside_dev_dir\n        context._create_or_load_transform(description=\"outside dev dir warning test\")\n        transform = context._transform\n        assert \"falling back to using filename as transform key\" in ccaplog.text\n        assert transform.key == expected_key\n    finally:\n        ln_setup.settings.dev_dir = previous_dev_dir\n        ln.context._uid = None\n        ln.context._run = None\n        ln.context._transform = None\n        ln.context._path = None\n        if transform is not None:\n            transform.delete(permanent=True)\n\n\ndef test_run_scripts():\n    # regular execution\n    result = subprocess.run(  # noqa: S602\n        f\"python {SCRIPTS_DIR / 'script-to-test-versioning.py --param 42'}\",\n        shell=True,\n        capture_output=True,\n    )\n    assert result.returncode == 0\n    assert \"created Transform('Ro1gl7n8YrdH0000'\" in result.stdout.decode()\n    assert \"started new Run(\" in result.stdout.decode()\n    transform = ln.Transform.get(\"Ro1gl7n8YrdH0000\")\n    assert transform.latest_run.cli_args == \"--param 42\"\n\n    # updated key (filename change)\n    result = subprocess.run(  # noqa: S602\n        f\"python {SCRIPTS_DIR / 'script-to-test-filename-change.py'}\",\n        shell=True,\n        capture_output=True,\n    )\n    assert result.returncode == 0\n    assert \"renaming transform\" in result.stdout.decode()\n    transform = ln.Transform.get(key=\"script-to-test-filename-change.py\")\n    assert transform.latest_run.cli_args is None\n\n    # 
version already taken\n    result = subprocess.run(  # noqa: S602\n        f\"python {SCRIPTS_DIR / 'duplicate1/script-to-test-versioning.py'}\",\n        shell=True,\n        capture_output=True,\n    )\n    assert result.returncode == 1\n    assert (\n        \"✗ version '1' is already taken by Transform('Ro1gl7n8YrdH0000'); please set another version, e.g., ln.context.version = '1.1'\"\n        in result.stderr.decode()\n    )\n\n    # regular version bump\n    result = subprocess.run(  # noqa: S602\n        f\"python {SCRIPTS_DIR / 'duplicate2/script-to-test-versioning.py'}\",\n        shell=True,\n        capture_output=True,\n    )\n    assert result.returncode == 0\n    assert \"created Transform('Ro1gl7n8YrdH0002'\" in result.stdout.decode()\n    assert \"started new Run(\" in result.stdout.decode()\n    assert not ln.Transform.get(\"Ro1gl7n8YrdH0001\").is_latest\n    assert ln.Transform.get(\"Ro1gl7n8YrdH0002\").is_latest\n\n    # inconsistent version\n    result = subprocess.run(  # noqa: S602\n        f\"python {SCRIPTS_DIR / 'duplicate3/script-to-test-versioning.py'}\",\n        shell=True,\n        capture_output=True,\n    )\n    assert result.returncode == 1\n    assert (\n        \"Transform is already tagged with version 2, but you passed 3\"\n        in result.stderr.decode()\n    )\n\n    # multiple folders, do not match the key because of the folder structure\n    ln.Transform.filter(key__endswith=\"script-to-test-versioning.py\").update(\n        key=\"teamA/script-to-test-versioning.py\"\n    )\n    # this test creates a transform with key script-to-test-versioning.py at the root level\n    result = subprocess.run(  # noqa: S602\n        f\"python {SCRIPTS_DIR / 'duplicate4/script-to-test-versioning.py'}\",\n        shell=True,\n        capture_output=True,\n    )\n    assert result.returncode == 0\n    assert \"ignoring transform\" in result.stdout.decode()\n\n    transform = ln.Transform.get(key=\"script-to-test-versioning.py\")\n\n    # 
multiple folders, match the key, also test is finished\n    result = subprocess.run(  # noqa: S602\n        f\"python {SCRIPTS_DIR / 'duplicate5/script-to-test-versioning.py'}\",\n        shell=True,\n        capture_output=True,\n    )\n    assert result.returncode == 0\n    assert f\"{transform.stem_uid}\" in result.stdout.decode()\n    assert \"making new version\" in result.stdout.decode()\n\n    transform = ln.Transform.get(key=\"script-to-test-versioning.py\")\n    assert transform.latest_run.finished_at is not None\n\n\ndef test_run_external_script():\n    script_path = \"sub/lamin-cli/tests/scripts/run-track-and-finish-sync-git.py\"\n    result = subprocess.run(  # noqa: S602\n        f\"python {script_path}\",\n        shell=True,\n        capture_output=True,\n    )\n    print(result.stdout.decode())\n    print(result.stderr.decode())\n    assert result.returncode == 0\n    assert \"created Transform\" in result.stdout.decode()\n    assert \"started new Run\" in result.stdout.decode()\n    transform = ln.Transform.get(key=\"run-track-and-finish-sync-git.py\")\n    # the algorithm currently picks different commits depending on the state of the repo\n    # any of these commits are valid\n    assert transform.uid == \"m5uCHTTpJnjQ0000\"\n    assert transform.reference.endswith(\n        \"/tests/scripts/run-track-and-finish-sync-git.py\"\n    )\n    assert transform.reference.startswith(\n        \"https://github.com/laminlabs/lamin-cli/blob/\"\n    )\n    assert transform.reference_type == \"url\"\n    assert transform.description == \"My good script\"\n    # ensure that the source code is not saved as an output artifact\n    assert transform.latest_run.output_artifacts.count() == 0\n    assert transform.runs.count() == 1\n    assert transform.hash == \"VC1oTPcaVSrzNrXUT9p4qw\"\n\n\n@pytest.mark.parametrize(\"type\", [\"notebook\", \"script\"])\ndef test_track_notebook_or_script_manually(type):\n    transform = ln.Transform(key=\"My notebook\", kind=type)\n 
   with pytest.raises(ValueError) as error:\n        ln.track(transform=transform)\n    assert (\n        error.exconly()\n        == \"ValueError: Use `ln.track()` without passing transform in a notebook or script - metadata is automatically parsed\"\n    )\n\n\ndef test_clean_r_notebook_html():\n    orig_notebook_path = NOTEBOOKS_DIR / \"basic-r-notebook.Rmd.html\"\n    content = orig_notebook_path.read_text()\n    orig_notebook_path.write_text(content.replace(\"SHORTCUT\", get_shortcut()))\n    comparison_path = NOTEBOOKS_DIR / \"basic-r-notebook.Rmd.cleaned.html\"\n    compare = comparison_path.read_text()\n    comparison_path.unlink()\n    title_text, cleaned_path = clean_r_notebook_html(orig_notebook_path)\n    assert comparison_path == cleaned_path\n    assert title_text == \"My exemplary R analysis\"\n    assert compare == cleaned_path.read_text()  # check that things have been stripped\n    comparison_path.write_text(compare)\n    orig_notebook_path.write_text(content.replace(get_shortcut(), \"SHORTCUT\"))\n\n\ndef test_notebook_to_script_notebooknode_metadata(tmp_path):\n    \"\"\"Test that notebook_to_script handles NotebookNode metadata.\n\n    https://github.com/laminlabs/lamindb/issues/3480\n    \"\"\"\n    import nbformat\n    from lamindb._finish import notebook_to_script\n\n    nb = nbformat.v4.new_notebook()\n    nb.metadata[\"kernelspec\"] = nbformat.NotebookNode({\"display_name\": \"python3\"})\n    notebook_path = tmp_path / \"test.ipynb\"\n    nbformat.write(nb, notebook_path)\n\n    # This would raise RepresenterError without metadata.clear()\n    result = notebook_to_script(\"Test\", notebook_path)\n    assert result is not None\n    assert \"NotebookNode\" not in result\n\n\nclass MockRun:\n    def __init__(self, uid):\n        self.uid = uid\n        self.report = None\n        self.saved = False\n\n    def save(self):\n        self.saved = True\n\n\ndef test_logstream_tracker_multiple():\n    tracker1 = LogStreamTracker()\n    tracker2 = 
LogStreamTracker()\n    tracker3 = LogStreamTracker()\n\n    try:\n        # Start trackers one by one and print messages\n        print(\"Initial stdout\")\n\n        tracker1.start(MockRun(\"run1\"))\n        print(\"After starting tracker1\")\n\n        tracker2.start(MockRun(\"run2\"))\n        print(\"After starting tracker2\")\n\n        tracker3.start(MockRun(\"run3\"))\n        print(\"After starting tracker3\")\n\n        print(\"Testing stderr\", file=sys.stderr)\n\n        time.sleep(0.1)\n\n        # Clean up in reverse order\n        tracker3.finish()\n        tracker2.finish()\n        tracker1.finish()\n\n        # Verify log contents - each log should only contain messages after its start\n        expected_contents = {\n            1: [\n                \"After starting tracker1\",\n                \"After starting tracker2\",\n                \"After starting tracker3\",\n                \"Testing stderr\",\n            ],\n            2: [\"After starting tracker2\", \"After starting tracker3\", \"Testing stderr\"],\n            3: [\"After starting tracker3\", \"Testing stderr\"],\n        }\n\n        for i in range(1, 4):\n            log_path = Path(ln_setup.settings.cache_dir / f\"run_logs_run{i}.txt\")\n            with open(log_path) as f:\n                content = f.read()\n                print(f\"\\nContents of run{i} log:\")\n                print(content)\n                # Check each expected line is in the content\n                for expected_line in expected_contents[i]:\n                    assert expected_line in content, (\n                        f\"Expected '{expected_line}' in log {i}\"\n                    )\n\n                # Check earlier messages are NOT in the content\n                if i > 1:\n                    assert \"Initial stdout\" not in content\n                    assert \"After starting tracker\" + str(i - 1) not in content\n\n    finally:\n        # Cleanup\n        for i in range(1, 4):\n            
log_path = Path(ln_setup.settings.cache_dir / f\"run_logs_run{i}.txt\")\n            if log_path.exists():\n                log_path.unlink()\n\n\ndef test_logstream_tracker_exception_handling():\n    tracker = LogStreamTracker()\n    original_excepthook = sys.excepthook\n    run = MockRun(\"error\")\n\n    try:\n        tracker.start(run)\n        print(\"Before error\")\n\n        # Create and capture exception info\n        exc_type = ValueError\n        exc_value = ValueError(\"Test error\")\n        exc_traceback = None\n        try:\n            raise exc_value\n        except ValueError:\n            exc_traceback = sys.exc_info()[2]\n\n        # Handle the exception - this will trigger cleanup\n        tracker.handle_exception(exc_type, exc_value, exc_traceback)\n\n        # Verify run status\n        assert run.saved\n        assert run.report is not None\n\n        # Verify the content was written before cleanup\n        content = run.report.cache().read_text()\n        print(\"Log contents:\", content)\n        assert \"Before error\" in content\n        assert \"ValueError: Test error\" in content\n        assert \"Traceback\" in content\n\n    finally:\n        tracker.finish()\n        sys.excepthook = original_excepthook\n        log_path = Path(ln_setup.settings.cache_dir / f\"run_logs_{run.uid}.txt\")\n        if log_path.exists():\n            log_path.unlink()\n\n\ndef test_logstream_tracker_cleanup_sigint_chains_to_keyboard_interrupt():\n    tracker = LogStreamTracker()\n    run = MockRun(\"sigint\")\n    original_excepthook = sys.excepthook\n\n    def raising_sigint_handler(signum, frame):\n        raise KeyboardInterrupt\n\n    try:\n        with (\n            patch(\n                \"signal.getsignal\",\n                side_effect=[signal.SIG_DFL, raising_sigint_handler],\n            ),\n            patch(\"signal.signal\"),\n            patch(\"lamindb._finish.save_run_logs\"),\n        ):\n            tracker.start(run)\n            
with pytest.raises(KeyboardInterrupt):\n                tracker.cleanup(signo=signal.SIGINT, frame=None)\n    finally:\n        tracker.finish()\n        sys.excepthook = original_excepthook\n        log_path = Path(ln_setup.settings.cache_dir / f\"run_logs_{run.uid}.txt\")\n        if log_path.exists():\n            log_path.unlink()\n"
  },
  {
    "path": "tests/core/test_track_step.py",
    "content": "import concurrent.futures\nfrom pathlib import Path\nfrom typing import Iterable\n\nimport lamindb as ln\nimport pandas as pd\nimport pytest\n\n\n@ln.step()\ndef process_chunk(\n    chunk_id: int, artifact_param: ln.Artifact, records_params: Iterable[ln.Record]\n) -> str:\n    # Create a simple DataFrame\n    df = pd.DataFrame(\n        {\"id\": range(chunk_id * 10, (chunk_id + 1) * 10), \"value\": range(10)}\n    )\n    env_file = Path(\"file_with_same_hash.txt\")\n    env_file.write_text(\"1\")\n    ln.Artifact(env_file, description=\"file_with_same_hash\").save()\n    # Save it as an artifact\n    key = f\"chunk_{chunk_id}.parquet\"\n    artifact = ln.Artifact.from_dataframe(df, key=key).save()\n    return artifact.key\n\n\ndef test_step_parallel():\n    # Ensure no global run from a previous test (e.g. test_flow)\n    ln.context._run = None\n    with pytest.raises(RuntimeError) as err:\n        process_chunk(4)\n    assert (\n        err.exconly()\n        == \"RuntimeError: Please track the global run context before using @ln.step(): ln.track() or @ln.flow()\"\n    )\n\n    # Ensure tracking is on\n    ln.track()\n\n    # Number of parallel executions\n    n_parallel = 3\n\n    param_artifact = ln.Artifact(\".gitignore\", key=\"param_artifact\").save()\n    ln.Record(name=\"record1\").save(), ln.Record(name=\"record2\").save()\n    records_params = ln.Record.filter(name__startswith=\"record\")\n\n    # Use ThreadPoolExecutor for parallel execution\n    with concurrent.futures.ThreadPoolExecutor(max_workers=n_parallel) as executor:\n        # Submit all tasks\n        futures = [\n            executor.submit(process_chunk, i, param_artifact, records_params)\n            for i in range(n_parallel)\n        ]\n        # Get results as they complete\n        chunk_keys = [\n            future.result() for future in concurrent.futures.as_completed(futures)\n        ]\n\n    # Verify results\n    # Each execution should have created its own artifact 
with unique run\n    print(f\"Created artifacts with keys: {chunk_keys}\")\n    artifacts = [ln.Artifact.get(key=key) for key in chunk_keys]\n    same_hash_artifacts = ln.Artifact.filter(description=\"file_with_same_hash\")\n\n    # Check that we got the expected number of artifacts\n    assert len(artifacts) == n_parallel\n    assert (\n        len(same_hash_artifacts) == 1\n    )  # only one artifact with the same hash should exist\n\n    # Verify each artifact has its own unique run\n    runs = [artifact.run for artifact in artifacts]\n    run_ids = [run.id for run in runs]\n    print(f\"Run IDs: {run_ids}\")\n    assert len(set(run_ids)) == n_parallel  # all runs should be unique\n\n    # Verify each run has the correct start and finish times\n    for run in runs:\n        print(f\"Run details: {run}\")\n        assert run.started_at is not None\n        assert run.finished_at is not None\n        assert run.started_at < run.finished_at\n        assert run.status == \"completed\"\n        assert isinstance(run.params[\"chunk_id\"], int)\n        assert run.params[\"artifact_param\"].startswith(\n            f\"Artifact[{param_artifact.uid}]\"\n        )\n        assert run.params[\"records_params\"] == [\n            f\"Record[{record.uid}]\" for record in records_params\n        ]\n\n    # Clean up test artifacts\n    runs = []\n    for artifact in artifacts:\n        runs.append(artifact.run)\n        artifact.delete(permanent=True)\n    param_artifact.delete(permanent=True)\n    same_hash_artifacts[0].delete(permanent=True)\n    Path(\"file_with_same_hash.txt\").unlink()\n    for run in runs:\n        run.delete(permanent=True)\n\n    ln.context._uid = None\n    ln.context._run = None\n    ln.context._transform = None\n    ln.context._path = None\n"
  },
  {
    "path": "tests/core/test_transform.py",
    "content": "from pathlib import Path\nfrom unittest.mock import patch\n\nimport lamindb as ln\nimport pytest\n\n\ndef test_transform_recovery_based_on_hash():\n    transform1 = ln.Transform(key=\"my-transform\", source_code=\"1\").save()\n    transform2 = ln.Transform(key=\"my-transform\", source_code=\"1\")\n    assert transform1 == transform2\n    transform1.delete()\n    transform2 = ln.Transform(key=\"my-transform\", source_code=\"1\")\n    assert transform1 != transform2\n    transform1.delete(permanent=True)\n\n\ndef test_transform_recovery_based_on_key():\n    transform1 = ln.Transform(key=\"my-transform\").save()\n    transform2 = ln.Transform(key=\"my-transform\")\n    assert transform1 == transform2\n    transform1.delete()\n    transform2 = ln.Transform(key=\"my-transform\")\n    assert transform1 != transform2\n    transform1.delete(permanent=True)\n\n\ndef test_revise_transforms():\n    # attempt to create a transform with an invalid version\n    with pytest.raises(ValueError) as error:\n        transform = ln.Transform(key=\"My transform\", version=0)\n        assert (\n            error.exconly()\n            == \"ValueError: `version` parameter must be `None` or `str`, e.g., '0.1', '1',\"\n            \" '2', etc.\"\n        )\n\n    # create a versioned transform\n    transform = ln.Transform(key=\"My transform\", version=\"1\")\n    assert transform.version_tag == \"1\"\n    assert transform.version == \"1\"\n    assert len(transform.uid) == ln.Transform._len_full_uid == 16\n    assert len(transform.stem_uid) == ln.Transform._len_stem_uid == 12\n\n    transform.save()\n\n    # try to reload the same transform with the same uid\n    transform_reload = ln.Transform(uid=transform.uid, key=\"My transform updated name\")\n    assert transform_reload.id == transform.id\n    assert transform_reload.key == \"My transform\"  # unchanged, prints logging\n    transform_reload = ln.Transform(\n        uid=transform.uid, description=\"My transform updated 
name\"\n    )\n    assert transform_reload.id == transform.id\n    assert (\n        transform_reload.description == \"My transform updated name\"\n    )  # unchanged, prints logging\n\n    # create new transform from old transform\n    transform_r2 = ln.Transform(description=\"My 2nd transform\", revises=transform)\n    assert transform_r2.uid != transform.uid\n    assert transform_r2.uid.endswith(\"0001\")\n    transform_r2 = ln.Transform(description=\"My 2nd transform\", revises=transform)\n    assert transform_r2.uid != transform.uid\n    assert transform_r2.uid.endswith(\"0001\")\n    assert transform_r2.stem_uid == transform.stem_uid\n    assert transform_r2.version_tag is None\n    assert (\n        transform_r2.version == transform_r2.uid[-4:]\n    )  # version falls back to uid suffix\n    assert transform_r2.is_latest\n    assert transform.is_latest\n    transform_r2.save()\n    assert not transform.is_latest\n\n    # create new transform from newly versioned transform\n    transform_r3 = ln.Transform(\n        description=\"My transform\", revises=transform_r2, version=\"2\"\n    )\n    assert transform_r3.stem_uid == transform.stem_uid\n    assert transform_r3.version_tag == \"2\"\n    assert transform_r3.version == \"2\"\n\n    # default description\n    transform_r3 = ln.Transform(revises=transform_r2)\n    assert transform_r3.description == transform_r2.description\n\n    # revise by matching on `key`\n    key = \"my-notebook.ipynb\"\n    transform_r2.key = key\n    transform_r2.save()\n    assert transform_r2.is_latest\n    transform_r3 = ln.Transform(description=\"My transform\", key=key, version=\"2\")\n    assert transform_r3.uid[:-4] == transform_r2.uid[:-4]\n    assert transform_r3.uid.endswith(\"0001\")\n    # this only fires if source code was actually saved\n    transform_r2.source_code = \"something\"\n    transform_r2.save()\n    transform_r3 = ln.Transform(description=\"My transform\", key=key, version=\"2\")\n    assert 
transform_r3.uid[:-4] == transform_r2.uid[:-4]\n    assert transform_r3.uid.endswith(\"0002\")\n    assert transform_r3.stem_uid == transform_r2.stem_uid\n    assert transform_r3.key == key\n    assert transform_r3.version_tag == \"2\"\n    assert transform_r3.version == \"2\"\n    assert transform_r3.is_latest\n    # because the new transform isn't yet saved, the old transform still has\n    # is_latest = True\n    assert transform_r2.is_latest\n    assert transform_r3._revises is not None\n    transform_r3.save()\n    # now r2 is no longer the latest version, but need to re-fresh from db\n    transform_r2 = ln.Transform.get(transform_r2.uid)\n    assert not transform_r2.is_latest\n\n    # wrong transform type\n    with pytest.raises(TypeError) as error:\n        ln.Transform(revises=ln.Record(name=\"x\"))\n    assert error.exconly().startswith(\n        \"TypeError: `revises` has to be of type `Transform`\"\n    )\n\n    # wrong kwargs\n    with pytest.raises(ValueError) as error:\n        ln.Transform(x=1)\n        assert (\n            error.exconly()\n            == \"ValueError: Only key, description, version_tag, type, revises,\"\n            \" reference, reference_type can be passed, but you passed: {'x': 1}\"\n        )\n\n    # test that reference transform cannot be deleted\n    transform_r2.delete()\n    transform.delete()\n\n    # unversioned transform\n    transform = ln.Transform(key=\"My transform\")\n    assert transform.version_tag is None\n    assert transform.version == transform.uid[-4:]  # version falls back to uid suffix\n\n    # what happens if we don't save the old transform?\n    # add a test for it!\n    transform.save()\n\n    # create new transform from old transform\n    new_transform = ln.Transform(description=\"My new transform\", revises=transform)\n    assert transform.version_tag is None\n    assert transform.version == transform.uid[-4:]  # version falls back to uid suffix\n    assert new_transform.stem_uid == 
transform.stem_uid\n    assert new_transform.uid.endswith(\"0001\")\n    assert new_transform.version_tag is None\n    assert (\n        new_transform.version == new_transform.uid[-4:]\n    )  # version falls back to uid suffix\n\n    transform.delete(permanent=True)\n\n\ndef test_delete():\n    # prepare the creation of a transform with its artifacts\n    transform = ln.Transform(key=\"My transform\").save()\n    run = ln.Run(transform)\n    report_path = Path(\"report.html\")\n    with open(report_path, \"w\") as f:\n        f.write(\"a\")\n    environment_path = Path(\"environment.txt\")\n    with open(environment_path, \"w\") as f:\n        f.write(\"c\")\n    report = ln.Artifact(report_path, description=f\"Report of {run.uid}\").save()\n    report_path.unlink()\n    report_path = report.path\n    environment = ln.Artifact(environment_path, description=\"requirements.txt\").save()\n    environment_path.unlink()\n    environment_path = environment.path\n    transform.save()\n    run.report = report\n    run.environment = environment\n    run.save()\n    assert report_path.exists()\n    assert environment_path.exists()\n    # now delete everything (run artifacts are cleaned up in background subprocess)\n    transform.delete(permanent=True)\n    assert len(ln.Run.filter(id=run.id)) == 0\n    # Clean up orphan report/env artifacts if subprocess has not run yet\n    for art in [report, environment]:\n        a = ln.Artifact.filter(id=art.id).first()\n        if a is not None:\n            a.delete(permanent=True, storage=True)\n    assert not report_path.exists()\n    assert not environment_path.exists()\n    assert len(ln.Artifact.filter(id__in=[report.id, environment.id])) == 0\n\n\n# see test_composite_component in test_schema.py\ndef test_successor_predecessor():\n    predecessor = ln.Transform(key=\"predecessor\").save()\n    successor1 = ln.Transform(key=\"successor1\").save()\n    successor2 = ln.Transform(key=\"successor2\").save()\n    
predecessor.successors.add(\n        successor1, successor2, through_defaults={\"config\": {\"param\": 42}}\n    )\n\n    assert len(predecessor.successors.all()) == 2\n    assert predecessor.links_successor.count() == 2\n    assert predecessor.links_successor.first().config == {\"param\": 42}\n    assert predecessor.links_successor.first().predecessor == predecessor\n    assert predecessor.predecessors.count() == 0\n    assert predecessor.links_predecessor.count() == 0\n\n    ln.models.transform.TransformTransform.filter(predecessor=predecessor).delete(\n        permanent=True\n    )\n\n    link = ln.models.transform.TransformTransform(\n        predecessor=predecessor, successor=successor1, config={\"param\": 42}\n    ).save()\n    assert link in predecessor.links_successor.all()\n    assert link in successor1.links_predecessor.all()\n    assert link.config == {\"param\": 42}\n\n    predecessor.delete(permanent=True)\n    successor1.delete(permanent=True)\n    successor2.delete(permanent=True)\n\n    assert ln.models.transform.TransformTransform.filter().count() == 0\n\n\ndef test_bulk_transform_permanent_delete(tmp_path):\n    \"\"\"Bulk Transform permanent delete deletes TransformProject, runs (and artifacts), then transforms.\"\"\"\n    transform = ln.Transform(key=\"Bulk transform delete\").save()\n    runs = [ln.Run(transform).save() for _ in range(2)]\n    report_files = [tmp_path / f\"bulk_report_{i}.txt\" for i in range(2)]\n    for f in report_files:\n        f.write_text(\"report content\")\n    report_artifacts = [\n        ln.Artifact(str(f), description=f\"report {i}\").save()\n        for i, f in enumerate(report_files)\n    ]\n    for run, art in zip(runs, report_artifacts):\n        run.report = art\n        run.save()\n    transform_id = transform.id\n    run_ids = [r.id for r in runs]\n    artifact_ids = [r.report_id for r in runs]\n\n    with patch(\"lamindb.models.run.subprocess.Popen\") as mock_popen:\n        
ln.Transform.filter(id=transform_id).delete(permanent=True)\n        mock_popen.assert_called_once()\n        args = mock_popen.call_args[0][0]\n        ids_str = args[args.index(\"--ids\") + 1]\n        assert {int(x) for x in ids_str.split(\",\")} == set(artifact_ids)\n\n    assert ln.Transform.filter(id=transform_id).count() == 0\n    for rid in run_ids:\n        assert ln.Run.filter(id=rid).count() == 0\n    # With mock, cleanup subprocess did not run; clean up orphan report artifacts\n    for aid in artifact_ids:\n        art = ln.Artifact.filter(id=aid).first()\n        if art is not None:\n            art.delete(permanent=True, storage=False)\n\n\ndef test_single_transform_permanent_delete_delegates_to_queryset(tmp_path):\n    \"\"\"Single Transform permanent delete delegates to QuerySet and removes runs and artifacts.\"\"\"\n    transform = ln.Transform(key=\"Single transform delete\").save()\n    run = ln.Run(transform).save()\n    report_file = tmp_path / \"single_report.txt\"\n    report_file.write_text(\"report\")\n    report = ln.Artifact(str(report_file), description=\"report\").save()\n    run.report = report\n    run.save()\n    transform_id = transform.id\n    run_id = run.id\n    artifact_id = report.id\n\n    with patch(\"lamindb.models.run.subprocess.Popen\") as mock_popen:\n        transform.delete(permanent=True)\n        mock_popen.assert_called_once()\n        args = mock_popen.call_args[0][0]\n        ids_str = args[args.index(\"--ids\") + 1]\n        assert artifact_id in {int(x) for x in ids_str.split(\",\")}\n\n    assert ln.Transform.filter(id=transform_id).count() == 0\n    assert ln.Run.filter(id=run_id).count() == 0\n    # With mock, cleanup subprocess did not run; clean up orphan report artifact\n    art = ln.Artifact.filter(id=artifact_id).first()\n    if art is not None:\n        art.delete(permanent=True, storage=False)\n\n\ndef test_bulk_transform_soft_delete():\n    \"\"\"Bulk Transform soft delete sets branch_id=-1.\"\"\"\n    
transform = ln.Transform(key=\"Bulk transform soft delete\").save()\n    ln.Run(transform).save()\n    transform_id = transform.id\n    ln.Transform.filter(id=transform_id).delete(permanent=False)\n    t = ln.Transform.filter(id=transform_id).one()\n    assert t.branch_id == -1\n    ln.Transform.filter(id=transform_id).delete(permanent=True)\n\n\ndef test_bulk_transform_permanent_delete_promotes_previous_version():\n    \"\"\"Bulk permanent delete of latest in a version family promotes the previous version.\"\"\"\n    v1 = ln.Transform(key=\"Bulk permanent delete version family\").save()\n    v2 = ln.Transform(revises=v1, key=\"Bulk permanent delete version family\").save()\n    assert v2.is_latest\n    stem_uid = v1.stem_uid\n\n    ln.Transform.filter(id=v2.id).delete(permanent=True)\n\n    assert ln.Transform.filter(id=v2.id).count() == 0\n    v1_after = ln.Transform.filter(uid__startswith=stem_uid).one()\n    assert v1_after.pk == v1.pk\n    assert v1_after.is_latest\n    v1.delete(permanent=True)\n\n\ndef test_bulk_transform_soft_delete_promotes_previous_version():\n    \"\"\"Bulk soft delete of latest in a version family promotes the previous version.\"\"\"\n    v1 = ln.Transform(key=\"Bulk soft delete version family\").save()\n    v2 = ln.Transform(revises=v1, key=\"Bulk soft delete version family\").save()\n    assert v2.is_latest\n    v2_id = v2.id\n    stem_uid = v1.stem_uid\n\n    ln.Transform.filter(id=v2_id).delete(permanent=False)\n\n    v2_after = ln.Transform.filter(id=v2_id).one()\n    assert v2_after.branch_id == -1\n    assert not v2_after.is_latest\n    v1.refresh_from_db()\n    assert v1.is_latest\n    assert ln.Transform.filter(uid__startswith=stem_uid).get(is_latest=True) == v1\n    # Clean up\n    v2_after.delete(permanent=True)\n    v1.delete(permanent=True)\n"
  },
  {
    "path": "tests/core/test_transform_from_git.py",
    "content": "import lamindb as ln\nimport pytest\n\nTEST_URL = \"https://github.com/openproblems-bio/task_batch_integration\"\n\n\ndef test_transform_from_git():\n    # test auto-inferred latest commit hash\n    transform1 = ln.Transform.from_git(url=TEST_URL, path=\"main.nf\")\n    assert transform1.source_code.startswith(f\"\"\"\\\nrepo: {TEST_URL}\npath: main.nf\ncommit:\"\"\")\n    assert transform1.key == \"openproblems-bio/task_batch_integration/main.nf\"\n    assert transform1.version_tag is None\n    assert transform1.description is None\n    assert transform1.reference.startswith(f\"{TEST_URL}/blob/\")\n    assert transform1.reference_type == \"url\"\n\n    # test checking out specific version\n    transform2 = ln.Transform.from_git(url=TEST_URL, path=\"main.nf\", version=\"v2.0.0\")\n    assert transform2.source_code.startswith(f\"\"\"\\\nrepo: {TEST_URL}\npath: main.nf\ncommit:\"\"\")\n    assert transform2.version_tag == \"v2.0.0\"\n    assert transform2.description is None\n    assert transform1.source_code != transform2.source_code\n    assert transform1.reference != transform2.reference\n\n    # test with description\n    transform2_with_desc = ln.Transform.from_git(\n        url=TEST_URL, path=\"main.nf\", version=\"v2.0.0\", description=\"Test description\"\n    )\n    assert transform2_with_desc.description == \"Test description\"\n    assert transform2_with_desc.version_tag == \"v2.0.0\"\n\n    # test sliding transform from branch\n    transform3 = ln.Transform.from_git(\n        url=TEST_URL, path=\"main.nf\", version=\"main\", branch=\"main\"\n    )\n    assert transform3.source_code.startswith(f\"\"\"\\\nrepo: {TEST_URL}\npath: main.nf\nbranch:\"\"\")\n    assert transform3.description is None\n    assert transform3.reference == f\"{TEST_URL}/tree/main/main.nf\"\n    assert transform3.reference_type == \"url\"\n\n\ndef test_transform_from_git_with_entrypoint():\n    # test auto-inferred latest commit hash\n    transform1 = 
ln.Transform.from_git(\n        url=TEST_URL, path=\"main.nf\", entrypoint=\"myentrypoint\"\n    )\n    assert transform1.source_code.startswith(f\"\"\"\\\nrepo: {TEST_URL}\npath: main.nf\nentrypoint: myentrypoint\ncommit:\"\"\")\n    assert transform1.description is None\n\n    # test with entrypoint and description\n    transform2 = ln.Transform.from_git(\n        url=TEST_URL,\n        path=\"main.nf\",\n        entrypoint=\"myentrypoint\",\n        description=\"Entrypoint description\",\n    )\n    assert transform2.description == \"Entrypoint description\"\n\n\ndef test_transform_custom_key_and_hash_lookup():\n    # test auto-inferred latest commit hash\n    transform1 = ln.Transform.from_git(\n        url=TEST_URL, path=\"main.nf\", key=\"mypipeline\"\n    ).save()\n    assert transform1.key == \"mypipeline\"\n    # trigger hash look up\n    transform2 = ln.Transform.from_git(url=TEST_URL, path=\"main.nf\", key=\"mypipeline2\")\n    assert transform1 == transform2\n    assert transform2.key == \"mypipeline\"\n    # skip hash look up\n    transform2 = ln.Transform.from_git(\n        url=TEST_URL, path=\"main.nf\", key=\"mypipeline2\", skip_hash_lookup=True\n    )\n    assert transform1 != transform2\n    assert transform2.key == \"mypipeline2\"\n    transform1.delete(permanent=True)\n\n\ndef test_transform_from_git_failure_modes():\n    # invalid tag\n    with pytest.raises(ValueError) as error:\n        ln.Transform.from_git(\n            url=TEST_URL,\n            path=\"main.nf\",\n            version=\"invalid\",\n        )\n    assert error.exconly().startswith(\"ValueError: Failed to checkout version invalid\")\n\n    # invalid branch\n    with pytest.raises(ValueError) as error:\n        ln.Transform.from_git(\n            url=TEST_URL,\n            path=\"main.nf\",\n            branch=\"invalid\",\n        )\n    assert error.exconly().startswith(\"ValueError: Failed to checkout branch invalid\")\n"
  },
  {
    "path": "tests/core/test_view.py",
    "content": "import lamindb as ln\n\n\ndef test_view():\n    ln.view(modules=\"core\")\n    ln.view()\n"
  },
  {
    "path": "tests/curators/conftest.py",
    "content": "import shutil\nfrom time import perf_counter\n\nimport lamindb_setup as ln_setup\nimport pytest\n\n\ndef pytest_sessionstart():\n    t_execute_start = perf_counter()\n    ln_setup.init(storage=\"./test-curators-db\", modules=\"bionty\")\n    total_time_elapsed = perf_counter() - t_execute_start\n    print(f\"time to setup the instance: {total_time_elapsed:.1f}s\")\n\n\ndef pytest_sessionfinish(session: pytest.Session):\n    shutil.rmtree(\"./test-curators-db\")\n    ln_setup.delete(\"test-curators-db\", force=True)\n\n\n@pytest.fixture\ndef ccaplog(caplog):\n    \"\"\"Attach the caplog handler to our custom logger for the duration of each test.\"\"\"\n    from lamin_utils._logger import logger\n\n    logger.addHandler(caplog.handler)\n\n    yield caplog\n\n    logger.removeHandler(caplog.handler)\n"
  },
  {
    "path": "tests/curators/test_cellxgene_curation.py",
    "content": "from typing import Generator\n\nimport bionty as bt\nimport lamindb as ln\nimport pytest\n\n\n@pytest.fixture\ndef cellxgene_defaults() -> Generator:\n    ln.examples.cellxgene.save_cellxgene_defaults()\n\n    yield\n\n    ln.Schema.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n    ln.ULabel.filter(type__isnull=False).delete(permanent=True)\n    for entity in [\n        bt.Disease,\n        bt.Ethnicity,\n        bt.DevelopmentalStage,\n        bt.Phenotype,\n        bt.CellType,\n        ln.ULabel,\n    ]:\n        entity.filter().delete(permanent=True)\n\n\ndef test_cellxgene_curation(cellxgene_defaults) -> None:\n    \"\"\"Tests validating a recent CELLxGENE dataset.\"\"\"\n    ln.examples.cellxgene.save_cellxgene_defaults()\n\n    cxg_schema = ln.examples.cellxgene.create_cellxgene_schema(\n        field_types=\"ontology_id\",\n        organism=\"mouse\",\n        spatial_library_id=\"Thymus_Visium_Exp3A_V2S1_3wk_B6-WT\",\n    )\n\n    adata = ln.examples.datasets.anndata_visium_mouse_cellxgene()\n\n    curator = ln.curators.AnnDataCurator(adata, cxg_schema)\n    curator.validate()\n\n    cxg_schema.delete(permanent=True)\n"
  },
  {
    "path": "tests/curators/test_curate_from_croissant.py",
    "content": "import shutil\n\nimport lamindb as ln\nimport pytest\n\n\n@pytest.mark.parametrize(\"filepath_prefix\", [None, \"test-curators-db/\"])\ndef test_curate_artifact_from_croissant(filepath_prefix: str | None):\n    croissant_path, dataset1_path = ln.examples.croissant.mini_immuno(\n        n_files=1, filepath_prefix=filepath_prefix\n    )\n    artifact1 = ln.integrations.curate_from_croissant(croissant_path)\n    assert (\n        artifact1.description\n        == \"Mini immuno dataset - A few samples from the immunology dataset\"\n    )\n    assert artifact1.key == \"mini_immuno.anndata.zarr\"\n    assert artifact1.version_tag == \"1.0\"\n    assert (\n        artifact1._key_is_virtual\n        if filepath_prefix is None\n        else not artifact1._key_is_virtual\n    )\n    license_label = artifact1.ulabels.get(\n        name=\"https://creativecommons.org/licenses/by/4.0/\"\n    )\n    project_label = artifact1.projects.get(name=\"Mini Immuno Project\")\n\n    # now mutate the dataset and create a new version\n    croissant_path, dataset1_path = ln.examples.croissant.mini_immuno(\n        n_files=1, filepath_prefix=filepath_prefix, strip_version=True\n    )\n    dummy_file_path = dataset1_path / \"dummy_file.txt\"\n    dummy_file_path.write_text(\"dummy file\")\n\n    artifact2 = ln.integrations.curate_from_croissant(croissant_path)\n    assert artifact2.description == artifact1.description\n    assert artifact2.key == artifact1.key\n    assert artifact2.version_tag is None\n    assert artifact2.stem_uid == artifact1.stem_uid\n    assert artifact2.uid != artifact1.uid\n    assert (\n        artifact2._key_is_virtual\n        if filepath_prefix is None\n        else not artifact2._key_is_virtual\n    )\n    license_label = artifact2.ulabels.get(\n        name=\"https://creativecommons.org/licenses/by/4.0/\"\n    )\n    project_label = artifact2.projects.get(name=\"Mini Immuno Project\")\n\n    shutil.rmtree(dataset1_path)\n    
croissant_path.unlink()\n    artifact1.delete(permanent=True, storage=True)  # because of real storage key\n    project_label.delete(permanent=True)\n    license_label.delete(permanent=True)\n\n\ndef test_curate_collection_from_croissant():\n    croissant_path, dataset1_path, dataset2_path = ln.examples.croissant.mini_immuno(\n        n_files=2\n    )\n    collection = ln.integrations.curate_from_croissant(croissant_path)\n    croissant_path.unlink()\n    shutil.rmtree(dataset1_path)\n    dataset2_path.unlink()\n    artifact1 = collection.artifacts.get(key=\"mini_immuno.anndata.zarr\")\n    artifact2 = collection.artifacts.get(key=\"mini.csv\")\n    license_label = collection.ulabels.get(\n        name=\"https://creativecommons.org/licenses/by/4.0/\"\n    )\n    project_label = collection.projects.get(name=\"Mini Immuno Project\")\n\n    collection.delete(permanent=True)\n    artifact1.delete(permanent=True)\n    artifact2.delete(permanent=True)\n    project_label.delete(permanent=True)\n    license_label.delete(permanent=True)\n"
  },
  {
    "path": "tests/curators/test_curators_examples.py",
    "content": "import sys\nfrom pathlib import Path\n\ndocs_path = Path.cwd() / \"docs\" / \"scripts\"\nsys.path.append(str(docs_path))\n\nimport anndata as ad\nimport bionty as bt\nimport lamindb as ln\nimport pandas as pd\nimport pytest\nfrom lamindb.core import datasets\nfrom lamindb.errors import InvalidArgument, ValidationError\n\n\n@pytest.fixture(scope=\"module\")\ndef mini_immuno_schema():\n    # define labels\n    perturbation = ln.ULabel(name=\"Perturbation\", is_type=True).save()\n    ln.ULabel(name=\"DMSO\", type=perturbation).save()\n    ln.ULabel(name=\"IFNG\", type=perturbation).save()\n    ln.ULabel(name=\"ulabel_but_not_perturbation\").save()\n    ln.ULabel.from_values([\"sample1\", \"sample2\", \"sample3\"], create=True).save()\n    bt.CellType.from_source(name=\"B cell\").save()\n    bt.CellType.from_source(name=\"T cell\").save()\n\n    # in next iteration for attrs\n    ln.Feature(name=\"temperature\", dtype=float).save()\n    # ln.Feature(name=\"experiment\", dtype=\"cat[ULabel]\").save()\n    # ln.Feature(name=\"date_of_study\", dtype=\"date\").save()\n    # ln.Feature(name=\"study_note\", dtype=\"str\").save()\n\n    # define schema\n    schema = ln.Schema(\n        name=\"mini_immuno_obs_level_metadata_curator_tests\",\n        features=[\n            ln.Feature(name=\"perturbation\", dtype=perturbation).save(),\n            ln.Feature(name=\"sample_note\", dtype=str).save(),\n            ln.Feature(name=\"cell_type_by_expert\", dtype=bt.CellType).save(),\n            ln.Feature(name=\"cell_type_by_model\", dtype=bt.CellType).save(),\n        ],\n        index=ln.Feature(name=\"sample_label\", dtype=ln.ULabel).save(),\n    ).save()\n\n    yield schema\n\n    for af in ln.Artifact.filter():\n        af.delete(permanent=True)\n\n    from lamindb.models import SchemaComponent\n\n    SchemaComponent.filter().delete(permanent=True)\n    ln.Schema.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n    
bt.Gene.filter().delete(permanent=True)\n    ln.ULabel.filter(type__isnull=False).delete(permanent=True)\n    ln.ULabel.filter().delete(permanent=True)\n    bt.CellType.filter().delete(permanent=True)\n\n\n@pytest.fixture(scope=\"module\")\ndef curator_params():\n    \"\"\"Common curator parameters.\"\"\"\n    return {\n        \"categoricals\": {\n            \"perturbation\": ln.ULabel.name,\n            \"cell_type_by_expert\": bt.CellType.name,\n            \"cell_type_by_model\": bt.CellType.name,\n        },\n        \"organism\": \"human\",\n    }\n\n\n@pytest.fixture(scope=\"module\")\ndef mudata_papalexi21_subset_schema():\n    # define labels\n    perturbation = ln.ULabel(name=\"Perturbation\", is_type=True).save()\n    ln.ULabel(name=\"Perturbed\", type=perturbation).save()\n    ln.ULabel(name=\"NT\", type=perturbation).save()\n\n    replicate = ln.ULabel(name=\"Replicate\", is_type=True).save()\n    ln.ULabel(name=\"rep1\", type=replicate).save()\n    ln.ULabel(name=\"rep2\", type=replicate).save()\n    ln.ULabel(name=\"rep3\", type=replicate).save()\n\n    # define obs schema\n    obs_schema = ln.Schema(\n        name=\"mudata_papalexi21_subset_obs_schema\",\n        features=[\n            ln.Feature(name=\"perturbation\", dtype=perturbation).save(),\n            ln.Feature(name=\"replicate\", dtype=replicate).save(),\n        ],\n    ).save()\n\n    obs_schema_rna = ln.Schema(\n        name=\"mudata_papalexi21_subset_rna_obs_schema\",\n        features=[\n            ln.Feature(name=\"nCount_RNA\", dtype=int).save(),\n            ln.Feature(name=\"nFeature_RNA\", dtype=int).save(),\n            ln.Feature(name=\"percent.mito\", dtype=float).save(),\n        ],\n        coerce=True,\n    ).save()\n\n    obs_schema_hto = ln.Schema(\n        name=\"mudata_papalexi21_subset_hto_obs_schema\",\n        features=[\n            ln.Feature(name=\"nCount_HTO\", dtype=int).save(),\n            ln.Feature(name=\"nFeature_HTO\", dtype=int).save(),\n            
ln.Feature(name=\"technique\", dtype=bt.ExperimentalFactor).save(),\n        ],\n        coerce=True,\n    ).save()\n\n    var_schema_rna = ln.Schema(\n        name=\"mudata_papalexi21_subset_rna_var_schema\",\n        itype=bt.Gene.symbol,\n        dtype=float,\n    ).save()\n\n    # define composite schema\n    mudata_schema = ln.Schema(\n        name=\"mudata_papalexi21_subset_mudata_schema\",\n        otype=\"MuData\",\n        slots={\n            \"obs\": obs_schema,\n            \"rna:obs\": obs_schema_rna,\n            \"hto:obs\": obs_schema_hto,\n            \"rna:var\": var_schema_rna,\n        },\n    ).save()\n\n    yield mudata_schema\n\n    for af in ln.Artifact.filter():\n        af.delete(permanent=True)\n    ln.Schema.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n    bt.models.SchemaGene.filter().delete()\n    bt.Gene.filter().delete(permanent=True)\n    ln.ULabel.filter(type__isnull=False).delete(permanent=True)\n    ln.ULabel.filter().delete(permanent=True)\n    bt.ExperimentalFactor.filter().delete(permanent=True)\n\n\n@pytest.fixture(scope=\"module\")\ndef study_metadata_schema():\n    from define_schema_df_metadata import study_metadata_schema\n\n    yield study_metadata_schema\n\n    study_metadata_schema.delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n\n\n@pytest.fixture(scope=\"module\")\ndef anndata_uns_schema():\n    from define_schema_anndata_uns import anndata_uns_schema\n\n    yield anndata_uns_schema\n\n    ln.Schema.filter().delete(permanent=True)\n    ln.Feature.filter().delete(permanent=True)\n\n\n@pytest.fixture(scope=\"module\")\ndef spatialdata_blobs_schema():\n    from define_schema_spatialdata import sdata_schema\n\n    yield sdata_schema\n\n    for af in ln.Artifact.filter():\n        af.delete(permanent=True)\n\n    from lamindb.models import SchemaComponent\n\n    SchemaComponent.filter().delete(permanent=True)\n\n    ln.Schema.filter().delete(permanent=True)\n    
bt.models.SchemaGene.filter().delete()\n    bt.Gene.filter().delete(permanent=True)\n    ln.ULabel.filter(type__isnull=False).delete(permanent=True)\n    ln.ULabel.filter().delete(permanent=True)\n    bt.ExperimentalFactor.filter().delete(permanent=True)\n    bt.DevelopmentalStage.filter().delete(permanent=True)\n    bt.Disease.filter().delete(permanent=True)\n\n\ndef test_dataframe_curator(mini_immuno_schema: ln.Schema):\n    \"\"\"Test DataFrame curator implementation.\"\"\"\n\n    # Get the perturbation ULabel (created in mini_immuno_schema fixture)\n    perturbation = ln.ULabel.get(name=\"Perturbation\", is_type=True)\n\n    # invalid simple dtype (float)\n    feature_to_fail = ln.Feature(name=\"treatment_time_h\", dtype=float).save()\n    schema = ln.Schema(\n        name=\"mini_immuno_obs_level_metadata_v2\",\n        features=[\n            ln.Feature(name=\"perturbation\", dtype=perturbation).save(),\n            ln.Feature(name=\"sample_note\", dtype=str).save(),\n            ln.Feature(name=\"cell_type_by_expert\", dtype=bt.CellType).save(),\n            ln.Feature(name=\"cell_type_by_model\", dtype=bt.CellType).save(),\n            feature_to_fail,\n        ],\n    ).save()\n    df = datasets.mini_immuno.get_dataset1(otype=\"DataFrame\")\n    curator = ln.curators.DataFrameCurator(df, schema)\n    with pytest.raises(ln.errors.ValidationError) as error:\n        curator.validate()\n    assert (\n        \"Column 'treatment_time_h' failed series or dataframe validator 0: <Check check_function: Column 'treatment_time_h' failed dtype check for 'float': got int64>\"\n        in error.exconly()\n    )\n\n    schema.delete(permanent=True)\n    feature_to_fail.delete(permanent=True)\n\n    # Wrong subtype\n    df = datasets.mini_immuno.get_dataset1(otype=\"DataFrame\", with_wrong_subtype=True)\n    curator = ln.curators.DataFrameCurator(df, mini_immuno_schema)\n    with pytest.raises(ln.errors.ValidationError) as error:\n        curator.validate()\n    assert 
(\n        error.exconly()\n        == \"\"\"lamindb.errors.ValidationError: 1 term not validated in feature 'perturbation': 'ulabel_but_not_perturbation'\n    → fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('perturbation')\n    → a valid label for subtype 'Perturbation' has to be one of ['DMSO', 'IFNG']\"\"\"\n    )\n\n    # Typo\n    df = datasets.mini_immuno.get_dataset1(otype=\"DataFrame\", with_typo=True)\n    curator = ln.curators.DataFrameCurator(df, mini_immuno_schema)\n    with pytest.raises(ln.errors.ValidationError) as error:\n        curator.validate()\n    assert (\n        error.exconly()\n        == \"\"\"lamindb.errors.ValidationError: 1 term not validated in feature 'perturbation': 'IFNJ'\n    → fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('perturbation')\n    → a valid label for subtype 'Perturbation' has to be one of ['DMSO', 'IFNG']\"\"\"\n    )\n\n    df = datasets.mini_immuno.get_dataset1(otype=\"DataFrame\")\n    curator = ln.curators.DataFrameCurator(df, mini_immuno_schema)\n    artifact = curator.save_artifact(key=\"examples/dataset1.parquet\")\n\n    assert artifact.schema == mini_immuno_schema\n    assert artifact.features.slots[\"columns\"].n_members == 5\n    assert (\n        artifact.features.describe(return_str=True)\n        == \"\"\"\\\nArtifact: examples/dataset1.parquet (0000)\n└── Dataset features\n    └── columns (5)\n        cell_type_by_expe…  bionty.CellType          B cell, CD8-positive, alph…\n        cell_type_by_model  bionty.CellType          B cell, T cell\n        perturbation        ULabel[Perturbation]     DMSO, IFNG\n        sample_label        ULabel                   sample1, sample2, sample3\n        sample_note         str\"\"\"\n    )\n    assert set(artifact.features.get_values()[\"sample_label\"]) == {\n        \"sample1\",\n        \"sample2\",\n        \"sample3\",\n    }\n    assert 
set(artifact.features.get_values()[\"cell_type_by_expert\"]) == {\n        \"CD8-positive, alpha-beta T cell\",\n        \"B cell\",\n    }\n    assert set(artifact.features.get_values()[\"cell_type_by_model\"]) == {\n        \"T cell\",\n        \"B cell\",\n    }\n\n    # a second dataset with missing values\n    ln.ULabel.from_values([\"sample4\", \"sample5\", \"sample6\"], create=True).save()\n    df = ln.examples.datasets.mini_immuno.get_dataset2(\n        otype=\"DataFrame\", gene_symbols_in_index=True\n    )\n    curator = ln.curators.DataFrameCurator(df, mini_immuno_schema)\n    with pytest.raises(ln.errors.ValidationError) as error:\n        curator.validate()\n    assert \"column 'sample_note' not in dataframe\" in error.exconly()\n    assert \"column 'cell_type_by_expert' not in dataframe\" in error.exconly()\n\n    curator.standardize()\n    curator.validate()\n\n    artifact.delete(permanent=True)\n\n\ndef test_dataframe_curator_index():\n    \"\"\"Test validating a DataFrame index.\"\"\"\n    df = datasets.mini_immuno.get_dataset1(\n        otype=\"DataFrame\", with_index_type_mismatch=True\n    )\n    feature = ln.Feature(name=\"test\", dtype=\"str\").save()\n    schema = ln.Schema(index=feature).save()\n    curator = ln.curators.DataFrameCurator(df, schema)\n    with pytest.raises(ln.errors.ValidationError) as error:\n        curator.validate()\n    assert \"expected series 'None' to have type str\" in error.exconly()\n\n    schema.delete(permanent=True)\n    feature.delete(permanent=True)\n\n\ndef test_dataframe_curator_validate_all_annotate_cat(mini_immuno_schema):\n    \"\"\"Do not pass any features.\"\"\"\n    schema = ln.Schema(itype=ln.Feature).save()\n    assert schema.flexible\n\n    df = datasets.mini_immuno.get_dataset1(otype=\"DataFrame\")\n    artifact = ln.Artifact.from_dataframe(\n        df, key=\"examples/dataset1.parquet\", schema=schema\n    ).save()\n    assert set(artifact.features.get_values()[\"perturbation\"]) == {\n        
\"DMSO\",\n        \"IFNG\",\n    }\n    assert set(artifact.features.get_values()[\"cell_type_by_expert\"]) == {\n        \"CD8-positive, alpha-beta T cell\",\n        \"B cell\",\n    }\n    assert set(artifact.features.get_values()[\"cell_type_by_model\"]) == {\n        \"T cell\",\n        \"B cell\",\n    }\n\n    artifact.delete(permanent=True)\n    schema.delete(permanent=True)\n\n\ndef test_same_name_different_type():\n    \"\"\"The same feature names are allowed as long as they have different feature types.\"\"\"\n    type_a = ln.Feature(\n        name=\"TypeA\", is_type=True, description=\"Type A features\"\n    ).save()\n    type_b = ln.Feature(\n        name=\"TypeB\", is_type=True, description=\"Type B features\"\n    ).save()\n\n    assay_a = ln.Feature(name=\"assay name\", type=type_a, dtype=str).save()\n    assay_b = ln.Feature(name=\"assay name\", type=type_b, dtype=str).save()\n\n    schema = ln.Schema(\n        name=\"schema_a\",\n        features=[ln.Feature.get(name=\"assay name\", type=type_a)],\n        flexible=True,\n        otype=\"DataFrame\",\n    ).save()\n\n    df = pd.DataFrame({\"assay name\": [\"exp1\", \"exp2\"]})\n\n    artifact = ln.Artifact.from_dataframe(df, description=\"testdata\").save()\n\n    curator = ln.curators.DataFrameCurator(artifact, schema)\n    curator.save_artifact()\n\n    artifact.delete(permanent=True)\n    ln.Schema.filter(features__name=\"assay name\").delete(permanent=True)\n    schema.delete(permanent=True)\n    for feat in [assay_a, assay_b, type_a, type_b]:\n        feat.delete(permanent=True)\n\n\ndef test_dataframe_curator_validate_all_annotate_cat2(mini_immuno_schema):\n    \"\"\"Combine half-specifying features, half not.\"\"\"\n    schema = ln.Schema(\n        itype=ln.Feature,\n        features=[ln.Feature.get(name=\"perturbation\")],\n        flexible=True,\n    ).save()\n    assert schema.flexible\n\n    df = datasets.mini_immuno.get_dataset1(otype=\"DataFrame\")\n    curator = 
ln.curators.DataFrameCurator(df, schema)\n    artifact = curator.save_artifact(key=\"examples/dataset1.parquet\")\n    assert set(artifact.features.get_values()[\"perturbation\"]) == {\n        \"DMSO\",\n        \"IFNG\",\n    }\n    assert set(artifact.features.get_values()[\"cell_type_by_expert\"]) == {\n        \"CD8-positive, alpha-beta T cell\",\n        \"B cell\",\n    }\n    assert set(artifact.features.get_values()[\"cell_type_by_model\"]) == {\n        \"T cell\",\n        \"B cell\",\n    }\n\n    artifact.delete(permanent=True)\n    schema.delete(permanent=True)\n\n\n@pytest.mark.parametrize(\"include_attrs_slot\", [True, False])\ndef test_dataframe_attrs_validation(study_metadata_schema, include_attrs_slot):\n    df = datasets.mini_immuno.get_dataset1(otype=\"DataFrame\")\n\n    perturbation = ln.ULabel(name=\"Perturbation\", is_type=True).save()\n    perturbation_feature = ln.Feature(name=\"perturbation\", dtype=perturbation).save()\n    ln.ULabel(name=\"DMSO\", type=perturbation).save()\n    ln.ULabel(name=\"IFNG\", type=perturbation).save()\n\n    if include_attrs_slot:\n        schema = ln.Schema(\n            features=[perturbation_feature],\n            slots={\"attrs\": study_metadata_schema},\n            otype=\"DataFrame\",\n        ).save()\n    else:\n        schema = ln.Schema(\n            features=[perturbation_feature],\n            otype=\"DataFrame\",\n        ).save()\n\n    bad_schema = ln.Schema(\n        features=[perturbation_feature],\n        slots={\"doesnotexist\": schema},\n        otype=\"DataFrame\",\n    ).save()\n\n    with pytest.raises(ValueError) as e:\n        curator = ln.curators.DataFrameCurator(df, schema=bad_schema)\n    assert (\n        \"Slot 'doesnotexist' is not supported for DataFrameCurator. 
Must be 'attrs'.\"\n        in str(e.value)\n    )\n\n    curator = ln.curators.DataFrameCurator(df, schema=schema)\n\n    if include_attrs_slot:\n        assert curator.slots[\"attrs\"].__class__.__name__ == \"ComponentCurator\"\n    else:\n        assert not curator.slots\n\n    curator.validate()\n    artifact = curator.save_artifact(key=\"examples/df_with_attrs.parquet\")\n\n    assert artifact.schema == schema\n    if include_attrs_slot:\n        assert \"attrs\" in artifact.features.slots\n        assert artifact.features.slots[\"attrs\"].features.first() == ln.Feature.get(\n            name=\"temperature\"\n        )\n        assert artifact.features.slots[\"attrs\"].features.last() == ln.Feature.get(\n            name=\"experiment\"\n        )\n    else:\n        assert (\n            not hasattr(artifact.features, \"slots\")\n            or \"attrs\" not in artifact.features.slots\n        )\n\n    from lamindb.models import SchemaComponent\n\n    SchemaComponent.filter().delete(permanent=True)\n    artifact.delete(permanent=True)\n    bad_schema.delete(permanent=True)\n    schema.delete(permanent=True)\n\n\ndef test_schema_new_genes(ccaplog):\n    df = pd.DataFrame(\n        index=pd.Index(\n            [\n                \"ENSG00000139618\",  # BRCA2\n                \"ENSG00000141510\",  # TP53\n                \"ENSG00999000001\",  # Invalid ID\n                \"ENSG00999000002\",  # Invalid ID\n            ],\n            name=\"ensembl\",\n        )\n    )\n    feature = ln.Feature(name=\"ensembl\", dtype=bt.Gene.ensembl_gene_id).save()\n    schema = ln.Schema(index=feature).save()\n    curator = ln.curators.DataFrameCurator(df, schema)\n    with pytest.raises(ln.errors.ValidationError) as error:\n        curator.validate()\n    assert error.exconly().startswith(\n        \"lamindb.errors.ValidationError: 2 terms not validated in feature 'index': 'ENSG00999000001', 'ENSG00999000002'\"\n    )\n\n    assert (\n        \"2 terms not validated in 
feature 'index': 'ENSG00999000001', 'ENSG00999000002'\"\n        in ccaplog.text\n    )\n\n    schema.delete(permanent=True)\n    feature.delete(permanent=True)\n\n\ndef test_schema_no_match_ensembl():\n    df = pd.DataFrame(\n        index=pd.Index(\n            [\n                \"ENSG99999999998\",  # Invalid ID\n                \"ENSG99999999999\",  # Invalid ID\n            ],\n            name=\"ensembl\",\n        )\n    )\n    schema = ln.Schema(\n        index=ln.Feature(name=\"ensembl\", dtype=bt.Gene.ensembl_gene_id).save()\n    ).save()\n    curator = ln.curators.DataFrameCurator(df, schema)\n    with pytest.raises(ln.errors.ValidationError) as error:\n        curator.validate()\n    assert (\n        error.exconly()\n        == \"\"\"lamindb.errors.ValidationError: 2 terms not validated in feature 'index': 'ENSG99999999998', 'ENSG99999999999'\n    → fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('index')\"\"\"\n    )\n\n    schema.delete(permanent=True)\n\n\ndef test_schema_mixed_ensembl_symbols(ccaplog):\n    \"\"\"Quite some datasets have mixed ensembl gene IDs and symbols.\n\n    The expected behavior is that an error is raised when such a dataset is encountered because\n    currently LaminDB does not support validating values against a union of Fields.\n\n    The current behavior is that these cases automatically pass.\n    \"\"\"\n    df = pd.DataFrame(\n        index=pd.Index(\n            [\n                \"ENSG00000139618\",\n                \"ENSG00000141510\",\n                \"BRCA2\",  # symbol\n                \"TP53\",  # symbol\n            ],\n            name=\"ensembl\",\n        )\n    )\n    schema = ln.Schema(\n        index=ln.Feature(name=\"ensembl\", dtype=bt.Gene.ensembl_gene_id).save()\n    ).save()\n    curator = ln.curators.DataFrameCurator(df, schema)\n    with pytest.raises(ln.errors.ValidationError) as error:\n        curator.validate()\n    assert error.exconly().startswith(\n      
  \"lamindb.errors.ValidationError: 2 terms not validated in feature 'index': 'BRCA2', 'TP53'\"\n    )\n\n    assert \"2 terms not validated in feature 'index': 'BRCA2', 'TP53'\" in ccaplog.text\n\n    schema.delete(permanent=True)\n\n\ndef test_schema_mixed_features(ccaplog):\n    \"\"\"Test that union dtype features validate against multiple registries.\"\"\"\n\n    mixed_feature = ln.Feature(\n        name=\"mixed_feature\",\n        dtype=\"cat[bionty.Tissue.ontology_id|bionty.CellType.ontology_id]\",\n    ).save()\n\n    df_mixed = pd.DataFrame({\"mixed_feature\": [\"UBERON:0000178\", \"CL:0000540\"]})\n    mixed_schema = ln.Schema(features=[mixed_feature], coerce=True).save()\n\n    mixed_curator = ln.curators.DataFrameCurator(df_mixed, mixed_schema)\n    mixed_curator.validate()\n    assert mixed_curator._is_validated\n\n    assert bt.CellType.filter(ontology_id=\"CL:0000540\").exists()\n    assert bt.Tissue.filter(ontology_id=\"UBERON:0000178\").exists()\n\n    df_invalid = pd.DataFrame({\"mixed_feature\": [\"INVALID:0000000\"]})\n    invalid_curator = ln.curators.DataFrameCurator(df_invalid, mixed_schema)\n    with pytest.raises(ln.errors.ValidationError):\n        invalid_curator.validate()\n\n    mixed_schema.delete(permanent=True)\n    mixed_feature.delete(permanent=True)\n\n\ndef test_anndata_curator_different_components(mini_immuno_schema: ln.Schema):\n    obs_schema = mini_immuno_schema\n\n    for add_comp in [\"var.T\", \"obs\", \"uns\"]:\n        var_schema = ln.Schema(\n            name=\"scRNA_seq_var_schema\",\n            itype=bt.Gene.ensembl_gene_id,\n            dtype=\"num\",\n        ).save()\n\n        # always assume var\n        components = {\"var.T\": var_schema}\n        if add_comp == \"obs\":\n            components[\"obs\"] = obs_schema\n        if add_comp == \"uns\":\n            uns_schema = ln.Schema(\n                name=\"flexible_uns_schema\",\n                itype=ln.Feature,\n            ).save()\n            
components[\"uns\"] = uns_schema\n\n        anndata_schema = ln.Schema(\n            name=\"mini_immuno_anndata_schema\",\n            otype=\"AnnData\",\n            slots=components,\n        ).save()\n        assert mini_immuno_schema.id is not None, mini_immuno_schema\n        assert anndata_schema.slots[\"var.T\"] == var_schema\n        if add_comp == \"obs\":\n            assert anndata_schema.slots[\"obs\"] == obs_schema\n        if add_comp == \"uns\":\n            assert anndata_schema.slots[\"uns\"] == uns_schema\n\n        describe_output = anndata_schema.describe(return_str=True)\n        assert \"mini_immuno_anndata_schema\" in describe_output\n        assert \"scRNA_seq_var_schema\" in describe_output\n        if add_comp == \"obs\":\n            assert \"mini_immuno_anndata_schema\" in describe_output\n        if add_comp == \"uns\":\n            assert \"flexible_uns_schema\" in describe_output\n\n        adata = datasets.mini_immuno.get_dataset1(otype=\"AnnData\")\n        curator = ln.curators.AnnDataCurator(adata, anndata_schema)\n        assert curator.slots[\"var.T\"].__class__.__name__ == \"ComponentCurator\"\n        if add_comp == \"obs\":\n            assert curator.slots[\"obs\"].__class__.__name__ == \"ComponentCurator\"\n        if add_comp == \"uns\":\n            assert curator.slots[\"uns\"].__class__.__name__ == \"ComponentCurator\"\n\n        artifact = ln.Artifact.from_anndata(\n            adata, key=\"examples/dataset1.h5ad\", schema=anndata_schema\n        )\n        assert artifact._curator._is_validated  # important test, do not remove\n        artifact.save()\n        assert not hasattr(artifact, \"_curator\")  # test that curator is deleted\n        assert artifact.schema == anndata_schema\n        assert artifact.features.slots[\"var.T\"].n_members == 3  # 3 genes get linked\n        if add_comp == \"obs\":\n            assert artifact.features.slots[\"obs\"] == obs_schema\n            assert 
set(artifact.features.get_values()[\"cell_type_by_expert\"]) == {\n                \"CD8-positive, alpha-beta T cell\",\n                \"B cell\",\n            }\n            assert set(artifact.features.get_values()[\"cell_type_by_model\"]) == {\n                \"T cell\",\n                \"B cell\",\n            }\n        if add_comp == \"uns\":\n            assert artifact.features.slots[\"uns\"].features.first() == ln.Feature.get(\n                name=\"temperature\"\n            )\n\n        artifact.delete(permanent=True)\n        anndata_schema.delete(permanent=True)\n        var_schema.delete(permanent=True)\n\n\ndef test_anndata_curator_varT_curation():\n    ln.Schema.filter(itype=\"bionty.Gene.ensembl_gene_id\").delete()\n    varT_schema = ln.Schema(itype=bt.Gene.ensembl_gene_id, maximal_set=True).save()\n    slot = \"var.T\"\n    components = {slot: varT_schema}\n    anndata_schema = ln.Schema(\n        otype=\"AnnData\",\n        slots=components,\n    ).save()\n    for with_gene_typo in [True, False]:\n        adata = datasets.mini_immuno.get_dataset1(\n            otype=\"AnnData\", with_gene_typo=with_gene_typo\n        )\n        if with_gene_typo:\n            with pytest.raises(ValidationError) as error:\n                artifact = ln.Artifact.from_anndata(\n                    adata, key=\"examples/dataset1.h5ad\", schema=anndata_schema\n                ).save()\n            assert error.exconly() == (\n                f\"lamindb.errors.ValidationError: 1 term not validated in feature 'columns' in slot '{slot}': 'GeneTypo'\\n\"\n                f\"    → fix typos, remove non-existent values, or save terms via: curator.slots['{slot}'].cat.add_new_from('columns')\"\n            )\n        else:\n            for n_max_records in [2, 4]:\n                ln.settings.annotation.n_max_records = n_max_records\n                artifact = ln.Artifact.from_anndata(\n                    adata, key=\"examples/dataset1.h5ad\", schema=anndata_schema\n    
            ).save()\n                assert (\n                    artifact.features.slots[slot].n_members == 3\n                )  # 3 genes get linked\n                assert (\n                    artifact.features.slots[slot].itype == \"bionty.Gene.ensembl_gene_id\"\n                )\n                if n_max_records == 2:\n                    assert not artifact.features.slots[slot].members.exists()\n                else:\n                    assert set(\n                        artifact.features.slots[slot]\n                        .members.to_dataframe()[\"ensembl_gene_id\"]\n                        .tolist()\n                    ) == {\n                        \"ENSG00000153563\",\n                        \"ENSG00000010610\",\n                        \"ENSG00000170458\",\n                    }\n\n                artifact.delete(permanent=True)\n\n            anndata_schema.delete(permanent=True)\n            varT_schema.delete(permanent=True)\n\n\ndef test_anndata_curator_varT_curation_legacy(ccaplog):\n    varT_schema = ln.Schema(itype=bt.Gene.ensembl_gene_id, maximal_set=True).save()\n    slot = \"var\"\n    components = {slot: varT_schema}\n    anndata_schema = ln.Schema(\n        otype=\"AnnData\",\n        slots=components,\n    ).save()\n    for with_gene_typo in [True, False]:\n        adata = datasets.mini_immuno.get_dataset1(\n            otype=\"AnnData\", with_gene_typo=with_gene_typo\n        )\n        if with_gene_typo:\n            with pytest.raises(ValidationError) as error:\n                artifact = ln.Artifact.from_anndata(\n                    adata, key=\"examples/dataset1.h5ad\", schema=anndata_schema\n                ).save()\n            assert error.exconly() == (\n                f\"lamindb.errors.ValidationError: 1 term not validated in feature 'var_index' in slot '{slot}': 'GeneTypo'\\n\"\n                f\"    → fix typos, remove non-existent values, or save terms via: 
curator.slots['{slot}'].cat.add_new_from('var_index')\"\n            )\n        else:\n            artifact = ln.Artifact.from_anndata(\n                adata, key=\"examples/dataset1.h5ad\", schema=anndata_schema\n            ).save()\n            assert (\n                \"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}\"\n                in ccaplog.text\n            )\n            assert artifact.features.slots[slot].n_members == 3  # 3 genes get linked\n            assert set(\n                artifact.features.slots[slot].members.to_dataframe()[\"ensembl_gene_id\"]\n            ) == {\n                \"ENSG00000153563\",\n                \"ENSG00000010610\",\n                \"ENSG00000170458\",\n            }\n\n            artifact.delete(permanent=True)\n\n            anndata_schema.delete(permanent=True)\n            varT_schema.delete(permanent=True)\n\n\ndef test_anndata_curator_nested_uns(study_metadata_schema, anndata_uns_schema):\n    \"\"\"Test AnnDataCurator with nested uns slot validation.\"\"\"\n    adata = datasets.mini_immuno.get_dataset1(otype=\"AnnData\")\n    adata.uns[\"study_metadata\"] = adata.uns.copy()\n\n    curator = ln.curators.AnnDataCurator(adata, anndata_uns_schema)\n    assert curator.slots[\"uns:study_metadata\"].__class__.__name__ == \"ComponentCurator\"\n\n    curator.validate()\n    artifact = curator.save_artifact(key=\"examples/anndata_with_uns.h5ad\")\n\n    assert artifact.schema == anndata_uns_schema\n    assert \"uns:study_metadata\" in artifact.features.slots\n    assert artifact.features.slots[\n        \"uns:study_metadata\"\n    ].features.first() == ln.Feature.get(name=\"temperature\")\n\n    adata = datasets.mini_immuno.get_dataset1(otype=\"AnnData\")\n    bad_schema1 = ln.Schema(\n        otype=\"AnnData\",\n        slots={\"uns:nonexistent\": study_metadata_schema},\n    ).save()\n    with 
pytest.raises(InvalidArgument) as e:\n        ln.curators.AnnDataCurator(adata, bad_schema1)\n    assert (\n        \"Schema slot 'uns:nonexistent' requires keys uns['nonexistent'] but key 'nonexistent' not found.\"\n        in str(e.value)\n    )\n\n    with pytest.raises(InvalidArgument) as e:\n        bad_schema2 = ln.Schema(\n            otype=\"AnnData\",\n            slots={\"uns:temperature:nonexistent_nested\": study_metadata_schema},\n        ).save()\n        ln.curators.AnnDataCurator(adata, bad_schema2)\n    assert (\n        \"Schema slot 'uns:temperature:nonexistent_nested' requires keys uns['temperature']['nonexistent_nested'] but key 'nonexistent_nested' not found. Available keys at this level: none (not a dict).\"\n        in str(e.value)\n    )\n\n    inferred_sets = artifact.schemas.all()\n    for inferred_set in inferred_sets:\n        artifact.schemas.remove(inferred_set)\n    artifact.delete(permanent=True)\n    bad_schema1.delete(permanent=True)\n    bad_schema2.delete(permanent=True)\n    anndata_uns_schema.delete(permanent=True)\n\n\ndef test_anndata_curator_no_var(mini_immuno_schema: ln.Schema):\n    assert mini_immuno_schema.id is not None, mini_immuno_schema\n    # test no var schema\n    anndata_schema_no_var = ln.Schema(\n        name=\"mini_immuno_anndata_schema_no_var\",\n        otype=\"AnnData\",\n        slots={\"obs\": mini_immuno_schema},\n    ).save()\n    assert mini_immuno_schema.id is not None, mini_immuno_schema\n    adata = datasets.mini_immuno.get_dataset1(otype=\"AnnData\")\n    curator = ln.curators.AnnDataCurator(adata, anndata_schema_no_var)\n\n    artifact = curator.save_artifact(key=\"examples/dataset1_no_var.h5ad\")\n    artifact.delete(permanent=True)\n    anndata_schema_no_var.delete(permanent=True)\n\n\ndef test_mudata_curator(\n    mudata_papalexi21_subset_schema: ln.Schema, mini_immuno_schema: ln.Schema\n):\n    mudata_schema = mudata_papalexi21_subset_schema\n    mdata = 
ln.examples.datasets.mudata_papalexi21_subset()\n    # wrong dataset\n    with pytest.raises(InvalidArgument):\n        ln.curators.MuDataCurator(pd.DataFrame(), mudata_schema)\n    # wrong schema\n    with pytest.raises(InvalidArgument):\n        ln.curators.MuDataCurator(mdata, mini_immuno_schema)\n    try:\n        # TODO: allow set cat_filters for a Schema with itype\n        bt.settings.organism = \"human\"\n        curator = ln.curators.MuDataCurator(mdata, mudata_schema)\n        assert curator.slots.keys() == {\n            \"obs\",\n            \"rna:obs\",\n            \"hto:obs\",\n            \"rna:var\",\n        }\n        curator.validate()\n        curator.slots[\"rna:var\"].cat.standardize(\"columns\")\n        curator.slots[\"rna:var\"].cat.add_new_from(\"columns\")\n        artifact = curator.save_artifact(key=\"mudata_papalexi21_subset.h5mu\")\n        assert artifact.schema == mudata_schema\n        assert set(artifact.features.slots.keys()) == {\n            \"obs\",\n            \"rna:var\",\n            \"rna:obs\",\n            \"hto:obs\",\n        }\n\n        artifact.delete(permanent=True)\n        mudata_schema.delete(permanent=True)\n        mini_immuno_schema.delete(permanent=True)\n        Path(\"papalexi21_subset.h5mu\").unlink(missing_ok=True)\n    finally:\n        bt.settings.organism = None\n\n\ndef test_mudata_curator_nested_uns(study_metadata_schema):\n    \"\"\"Test MuData with nested uns slot validation.\n\n    This test verifies the behavior of both the MuData `.uns` slots and a `.uns` slot of\n    an AnnData object inside the MuData object that gets specified using the key `:` syntax.\n    \"\"\"\n    mdata = ln.examples.datasets.mudata_papalexi21_subset(with_uns=True)\n\n    site_uns_schema = ln.Schema(\n        features=[\n            ln.Feature(name=\"pos\", dtype=float).save(),\n            ln.Feature(name=\"site_id\", dtype=str).save(),\n        ]\n    ).save()\n\n    mdata_schema = ln.Schema(\n        
otype=\"MuData\",\n        slots={\n            \"uns:study_metadata\": study_metadata_schema,\n            \"rna:uns:site_metadata\": site_uns_schema,\n        },\n    ).save()\n\n    curator = ln.curators.MuDataCurator(mdata, mdata_schema)\n    assert curator.slots[\"uns:study_metadata\"].__class__.__name__ == \"ComponentCurator\"\n    assert (\n        curator.slots[\"rna:uns:site_metadata\"].__class__.__name__ == \"ComponentCurator\"\n    )\n\n    curator.validate()\n    artifact = curator.save_artifact(key=\"examples/mdata_with_uns.h5mu\")\n\n    assert artifact.schema == mdata_schema\n    assert \"uns:study_metadata\" in artifact.features.slots\n    assert \"rna:uns:site_metadata\" in artifact.features.slots\n    assert artifact.features.slots[\n        \"uns:study_metadata\"\n    ].features.first() == ln.Feature.get(name=\"temperature\")\n    assert artifact.features.slots[\n        \"rna:uns:site_metadata\"\n    ].features.first() == ln.Feature.get(name=\"pos\")\n\n    # Clean up\n    artifact.delete(permanent=True)\n    Path(\"papalexi21_subset.h5mu\").unlink(missing_ok=True)\n\n\ndef test_spatialdata_curator(\n    spatialdata_blobs_schema: ln.Schema,\n):\n    spatialdata = ln.examples.datasets.spatialdata_blobs()\n\n    # wrong dataset\n    with pytest.raises(InvalidArgument):\n        ln.curators.SpatialDataCurator(pd.DataFrame(), spatialdata_blobs_schema)\n    # wrong schema - use an actual slot that exists\n    with pytest.raises(InvalidArgument):\n        ln.curators.SpatialDataCurator(\n            spatialdata, spatialdata_blobs_schema.slots[\"attrs:bio\"]\n        )\n\n    curator = ln.curators.SpatialDataCurator(spatialdata, spatialdata_blobs_schema)\n    with pytest.raises(ln.errors.ValidationError):\n        curator.validate()\n\n    spatialdata.tables[\"table\"].var.drop(index=\"ENSG00000999999\", inplace=True)\n    artifact = ln.Artifact.from_spatialdata(\n        spatialdata,\n        key=\"examples/spatialdata1.zarr\",\n        
schema=spatialdata_blobs_schema,\n    ).save()\n    assert artifact.schema == spatialdata_blobs_schema\n    assert artifact.features.slots.keys() == {\n        \"attrs:bio\",\n        \"attrs:tech\",\n        \"attrs\",\n        \"tables:table:obs\",\n        \"tables:table:var.T\",\n    }\n    assert artifact.features.get_values()[\"disease\"] == \"Alzheimer disease\"\n    assert (\n        artifact.features.describe(return_str=True)\n        == \"\"\"Artifact: examples/spatialdata1.zarr (0000)\n└── Dataset features\n    ├── attrs:bio (2)\n    │   developmental_sta…  bionty.DevelopmentalSt…  adult stage\n    │   disease             bionty.Disease           Alzheimer disease\n    ├── attrs:tech (1)\n    │   assay               bionty.ExperimentalFac…  Visium Spatial Gene Expres…\n    ├── attrs (2)\n    │   bio                 dict\n    │   tech                dict\n    ├── tables:table:obs …\n    │   sample_region       str\n    └── tables:table:var.…\n        BRAF                num\n        BRCA2               num\"\"\"\n    )\n\n    artifact.delete(permanent=True)\n\n\ndef test_specific_source():\n    \"\"\"Test validation of ontology terms using cat_filters to specify organism-specific source.\"\"\"\n    obs_schema = ln.Schema(\n        features=[\n            ln.Feature(\n                name=\"developmental_stage_ontology_id\",\n                dtype=bt.DevelopmentalStage.ontology_id,\n                cat_filters={\n                    \"source\": bt.Source.filter(\n                        entity=\"bionty.DevelopmentalStage\", organism=\"mouse\"\n                    ).one()\n                },\n            ).save()\n        ],\n        coerce=True,\n        minimal_set=False,\n    ).save()\n\n    schema = ln.Schema(\n        slots={\"obs\": obs_schema}, otype=\"AnnData\", minimal_set=True, coerce=True\n    ).save()\n\n    adata = ad.AnnData(\n        obs=pd.DataFrame(\n            {\n                \"developmental_stage_ontology_id\": [\n                    
\"MmusDv:0000142\",\n                    \"MmusDv:0000022\",\n                ]\n            }\n        ),\n        var=pd.DataFrame(index=[\"ENSMUSG00000022391\", \"ENSMUSG00000018569\"]),\n    )\n\n    curator = ln.curators.AnnDataCurator(adata, schema)\n    curator.validate()\n\n    schema.delete(permanent=True)\n"
  },
  {
    "path": "tests/curators/test_curators_remote.py",
    "content": "import lamindb as ln\n\n\ndef test_curator_remote():\n    lamindata_artifacts = ln.Artifact.connect(\"laminlabs/lamindata\")\n    curator = ln.curators.DataFrameCurator(\n        lamindata_artifacts.get(\"Ywz5JiVNHOWSJDiK\"),\n        schema=ln.examples.schemas.valid_features(),\n    )\n    curator.validate()\n"
  },
  {
    "path": "tests/curators/test_dataframe_curation.py",
    "content": "\"\"\"Test suite for accounting on bank transactions.\"\"\"\n\nimport datetime\n\nimport lamindb as ln\nimport pandas as pd\nimport pytest\n\n\n@pytest.fixture(scope=\"module\")\ndef transactions_schema():\n    # Labels\n    currency_type = ln.ULabel(name=\"Currency\", is_type=True).save()\n    usd = ln.ULabel(name=\"USD\", type=currency_type).save()\n    eur = ln.ULabel(name=\"EUR\", type=currency_type).save()\n\n    assert usd.type == currency_type\n    assert eur.type == currency_type\n\n    # Features\n    currency = ln.Feature(name=\"currency_name\", dtype=\"cat[ULabel[Currency]]\").save()\n    date = ln.Feature(name=\"date\", dtype=\"date\").save()\n    receipt_url = ln.Feature(name=\"receipt_url\", dtype=\"url\").save()\n\n    transaction_type = ln.Feature(name=\"Transaction\", is_type=True).save()\n    amount_usd = ln.Feature(\n        name=\"transaction_amount_usd_cent\", dtype=int, type=transaction_type\n    ).save()\n    amount_eur = ln.Feature(\n        name=\"transaction_amount_eur_cent\", dtype=int, type=transaction_type\n    ).save()\n\n    # Schema\n    schema = ln.Schema(\n        name=\"transaction_dataframe\",\n        otype=\"DataFrame\",\n        features=[\n            date,\n            amount_usd,\n            amount_eur,\n            currency,\n            receipt_url,\n        ],\n        coerce=True,\n    ).save()\n\n    yield schema\n\n    ln.Schema.filter(\n        features__name__in=[\n            \"transaction_amount_eur_cent\",\n            \"transaction_amount_usd_cent\",\n        ]\n    ).delete(permanent=True)\n    schema.delete(permanent=True)\n    amount_eur.delete(permanent=True)\n    amount_usd.delete(permanent=True)\n    transaction_type.delete(permanent=True)\n    date.delete(permanent=True)\n    receipt_url.delete(permanent=True)\n    currency.delete(permanent=True)\n    eur.delete(permanent=True)\n    usd.delete(permanent=True)\n    currency_type.delete(permanent=True)\n\n\n@pytest.fixture\ndef 
transactions_dataframe():\n    # Create sample data\n    data = {\n        \"date\": [\n            datetime.date(2024, 1, 1),\n            datetime.date(2024, 1, 2),\n            datetime.date(2024, 1, 3),\n            datetime.date(2024, 1, 4),\n            datetime.date(2024, 1, 5),\n        ],\n        \"transaction_amount_usd_cent\": [1000, 2000, 3000, 4000, 5000],\n        \"transaction_amount_eur_cent\": [850, 1700, 2550, 3400, 4250],\n        \"currency_name\": [\"USD\", \"EUR\", \"USD\", \"EUR\", \"USD\"],\n        \"receipt_url\": [\n            \"https://bank.example/tx/1\",\n            \"https://bank.example/tx/2\",\n            \"https://bank.example/tx/3\",\n            \"https://bank.example/tx/4\",\n            \"https://bank.example/tx/5\",\n        ],\n    }\n    return pd.DataFrame(data)\n\n\ndef test_schema_creation(transactions_schema):\n    \"\"\"Test if schema was created properly\"\"\"\n    schema = ln.Schema.get(name=\"transaction_dataframe\")\n    assert schema is not None\n    assert schema.otype == \"DataFrame\"\n    # check the order of the features\n    assert schema.members.to_list(\"name\") == [\n        \"date\",\n        \"transaction_amount_usd_cent\",\n        \"transaction_amount_eur_cent\",\n        \"currency_name\",\n        \"receipt_url\",\n    ]\n\n\ndef test_data_curation(\n    transactions_schema: ln.Schema, transactions_dataframe: ln.Schema\n):\n    \"\"\"Test if data curation works properly\"\"\"\n    curator = ln.curators.DataFrameCurator(transactions_dataframe, transactions_schema)\n    assert curator.validate() is None\n    # URLs are currently validated as string values.\n    assert transactions_dataframe[\"receipt_url\"].iloc[0] == \"https://bank.example/tx/1\"\n    artifact = curator.save_artifact(key=\"test_transaction_dataset.csv\")\n    assert artifact.suffix == \".csv\"\n    artifact.delete(permanent=True)\n\n\ndef test_missing_required_feature(transactions_schema: ln.Schema):\n    \"\"\"Test if validation 
fails for invalid data\"\"\"\n    data_missing_required_feature = {\n        \"date\": [datetime.date(2024, 1, 1)],\n        \"transaction_amount_usd_cent\": [1000],\n        \"currency_name\": [\"USD\"],\n        \"receipt_url\": [\"https://bank.example/tx/1\"],\n    }\n    invalid_df = pd.DataFrame(data_missing_required_feature)\n\n    schema = ln.Schema.get(name=\"transaction_dataframe\")\n    curator = ln.curators.DataFrameCurator(invalid_df, schema)\n\n    with pytest.raises(ln.errors.ValidationError) as err:\n        curator.validate()\n        message = \"column 'transaction_amount_eur_cent' not in dataframe. Columns in dataframe: ['date', 'transaction_amount_usd_cent', 'currency_name']\"\n        assert message in str(err)\n\n\ndef test_invalid_label(transactions_schema: ln.Schema):\n    \"\"\"Test if validation fails for invalid currency\"\"\"\n    # Create dataframe with invalid currency\n    invalid_data = {\n        \"date\": [datetime.date(2024, 1, 1)],\n        \"transaction_amount_usd_cent\": [1000],\n        \"transaction_amount_eur_cent\": [850],\n        \"currency_name\": [\"GBP\"],  # Invalid currency not in our labels\n        \"receipt_url\": [\"https://bank.example/tx/1\"],\n    }\n    invalid_df = pd.DataFrame(invalid_data)\n\n    schema = ln.Schema.get(name=\"transaction_dataframe\")\n    curator = ln.curators.DataFrameCurator(invalid_df, schema)\n\n    with pytest.raises(ln.errors.ValidationError):\n        curator.validate()\n    # exconly = \"\"\"lamindb.errors.ValidationError: 1 term is not validated: 'GBP'\n    # → fix typos, remove non-existent values, or save terms via .add_new_from(\"currency_name\")\"\"\"\n    # assert err.exconly() == exconly\n\n\ndef test_invalid_url_dtype(transactions_schema: ln.Schema):\n    \"\"\"Test if validation fails for non-string URL values.\"\"\"\n    invalid_data = {\n        \"date\": [datetime.date(2024, 1, 1)],\n        \"transaction_amount_usd_cent\": [1000],\n        
\"transaction_amount_eur_cent\": [850],\n        \"currency_name\": [\"USD\"],\n        \"receipt_url\": [123],  # URL is currently validated as string dtype\n    }\n    invalid_df = pd.DataFrame(invalid_data)\n\n    schema = ln.Schema.get(name=\"transaction_dataframe\")\n    curator = ln.curators.DataFrameCurator(invalid_df, schema)\n\n    with pytest.raises(ln.errors.ValidationError) as err:\n        curator.validate()\n    assert \"receipt_url\" in str(err.value)\n"
  },
  {
    "path": "tests/integrations/conftest.py",
    "content": "import shutil\nfrom time import perf_counter\n\nimport lamindb_setup as ln_setup\nimport pytest\n\n\ndef pytest_sessionstart():\n    t_execute_start = perf_counter()\n    ln_setup.init(storage=\"./testdb-integrations\")\n    total_time_elapsed = perf_counter() - t_execute_start\n    print(f\"time to setup the instance: {total_time_elapsed:.1f}s\")\n\n\ndef pytest_sessionfinish(session: pytest.Session):\n    shutil.rmtree(\"./testdb-integrations\")\n    ln_setup.delete(\"testdb-integrations\", force=True)\n\n\n@pytest.fixture\ndef ccaplog(caplog):\n    \"\"\"Attach the caplog handler to our custom logger for the duration of a test.\"\"\"\n    from lamin_utils._logger import logger\n\n    logger.addHandler(caplog.handler)\n\n    yield caplog\n\n    logger.removeHandler(caplog.handler)\n"
  },
  {
    "path": "tests/integrations/test_lightning.py",
    "content": "import json\nimport shutil\nfrom pathlib import Path\nfrom typing import Any, Generator, cast\nfrom unittest.mock import MagicMock\n\nimport lamindb as ln\nimport lightning as pl\nimport pytest\nimport torch\nfrom django.db import connection\nfrom django.test.utils import CaptureQueriesContext\nfrom lamindb.integrations import lightning as ll\nfrom lamindb.models._feature_manager import FeatureManager\nfrom torch import nn\nfrom torch.utils.data import DataLoader, TensorDataset\n\n\n@pytest.fixture(autouse=True)\ndef cleanup_checkpoints() -> Generator[None, None, None]:\n    \"\"\"Clean up checkpoint files and directories after each test.\"\"\"\n    yield\n    checkpoints_dir = Path(\"checkpoints\")\n    if checkpoints_dir.exists():\n        shutil.rmtree(checkpoints_dir)\n\n\n@pytest.fixture(autouse=True, scope=\"session\")\ndef cleanup_test_dir() -> Generator[None, None, None]:\n    \"\"\"Clean up test directory after all tests.\"\"\"\n    yield\n    for dirname in (\"lightning_checkpoints\", \"test_lightning\", \"lightning_logs\"):\n        dirpath = Path(dirname)\n        if dirpath.exists():\n            shutil.rmtree(dirpath)\n\n\n@pytest.fixture\ndef simple_model() -> pl.LightningModule:\n    class SimpleModel(pl.LightningModule):\n        def __init__(self):\n            super().__init__()\n            self.layer = nn.Linear(10, 1)\n\n        def forward(self, x):\n            return self.layer(x)\n\n        def training_step(self, batch, batch_idx):\n            x, y = batch\n            loss = nn.functional.mse_loss(self(x), y)\n            self.log(\"train_loss\", loss)\n            return loss\n\n        def configure_optimizers(self):\n            return torch.optim.Adam(self.parameters())\n\n    return SimpleModel()\n\n\n@pytest.fixture\ndef dataloader() -> DataLoader:\n    return DataLoader(\n        TensorDataset(torch.randn(100, 10), torch.randn(100, 1)), batch_size=10\n    )\n\n\n@pytest.fixture\ndef dirpath(request: 
pytest.FixtureRequest) -> Generator[str, None, None]:\n    prefix = f\"lightning_checkpoints/{request.node.name}/\"\n\n    yield prefix\n\n    for af in ln.Artifact.filter(key__startswith=prefix):\n        af.delete(permanent=True, storage=True)\n    dirpath_path = Path(prefix)\n    if dirpath_path.exists():\n        shutil.rmtree(dirpath_path)\n\n\n@pytest.fixture(scope=\"session\")\ndef lightning_features() -> Generator[None, None, None]:\n    \"\"\"Create lightning features.\"\"\"\n    ll.save_lightning_features()\n\n    yield\n\n    if lightning_type := ln.Feature.filter(name=\"lamindb.lightning\").one_or_none():\n        for feat in ln.Feature.filter(type=lightning_type):\n            for af in ln.Artifact.filter(schemas__features=feat):\n                af.delete(permanent=True, storage=True)\n            # JSONValues are lingering and also need to be deleted\n            ln.models.RunJsonValue.filter(jsonvalue__feature=feat).delete(\n                permanent=True\n            )\n            ln.models.JsonValue.filter(feature=feat).delete(permanent=True)\n            feat.delete(permanent=True)\n\n\ndef test_checkpoint_basic(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n):\n    \"\"\"Checkpoint should create artifacts with semantic paths.\"\"\"\n    callback = ll.Checkpoint(dirpath=dirpath, monitor=\"train_loss\")\n    trainer = pl.Trainer(\n        max_epochs=2,\n        callbacks=[callback],\n        logger=False,\n    )\n    trainer.fit(simple_model, dataloader)\n\n    prefix = callback.checkpoint_key_prefix + \"/\"\n    artifacts = ln.Artifact.filter(key__startswith=prefix)\n    assert len(artifacts) >= 1\n    for af in artifacts:\n        assert af.kind == \"model\"\n        assert af.key.startswith(prefix)\n\n\ndef test_checkpoint_with_features(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n):\n    \"\"\"Checkpoint should annotate artifacts with feature values.\"\"\"\n 
   ln.Feature(name=\"train_loss\", dtype=float).save()\n    ln.Feature(name=\"custom_param\", dtype=str).save()\n\n    ln.track()\n\n    callback = ll.Checkpoint(\n        dirpath=dirpath,\n        features={\n            \"artifact\": {\"train_loss\": None},\n            \"run\": {\"custom_param\": \"test_value\"},\n        },\n        monitor=\"train_loss\",\n    )\n    trainer = pl.Trainer(\n        max_epochs=2,\n        callbacks=[callback],\n        logger=False,\n    )\n    trainer.fit(simple_model, dataloader)\n\n    prefix = callback.checkpoint_key_prefix + \"/\"\n    artifacts = ln.Artifact.filter(key__startswith=prefix)\n    assert len(artifacts) >= 1\n    for af in artifacts:\n        values = af.features.get_values()\n        assert \"train_loss\" in values\n\n    assert ln.context.run.features.get_values()[\"custom_param\"] == \"test_value\"\n\n    ln.finish()\n\n\ndef test_checkpoint_missing_features(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n):\n    \"\"\"Checkpoint should raise an error when specified features do not exist.\"\"\"\n    callback = ll.Checkpoint(\n        dirpath=dirpath,\n        features={\"artifact\": {\"nonexistent_feature\": None}},\n        monitor=\"train_loss\",\n    )\n    trainer = pl.Trainer(\n        max_epochs=1,\n        callbacks=[callback],\n        logger=False,\n    )\n\n    with pytest.raises(ValueError, match=\"Feature nonexistent_feature missing\"):\n        trainer.fit(simple_model, dataloader)\n\n\ndef test_checkpoint_auto_features(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n    lightning_features: None,\n):\n    \"\"\"Checkpoint should auto-track lightning features if they exist.\"\"\"\n    callback = ll.Checkpoint(\n        dirpath=dirpath,\n        monitor=\"train_loss\",\n        save_top_k=2,\n    )\n    trainer = pl.Trainer(\n        max_epochs=3,\n        callbacks=[callback],\n        logger=False,\n    )\n    
trainer.fit(simple_model, dataloader)\n\n    prefix = callback.checkpoint_key_prefix + \"/\"\n    artifacts = ln.Artifact.filter(key__startswith=prefix)\n    assert len(artifacts) >= 1\n\n    for af in artifacts:\n        values = af.features.get_values()\n        assert \"is_best_model\" in values\n        assert \"is_last_model\" in values\n        assert \"score\" in values\n        assert \"model_rank\" in values\n\n\ndef test_checkpoint_auto_features_with_duplicate_score_name(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n    lightning_features: None,\n):\n    \"\"\"Auto-tracking should work if a generic 'score' feature also exists.\"\"\"\n    ln.Feature(name=\"score\", dtype=float).save()\n\n    callback = ll.Checkpoint(\n        dirpath=dirpath,\n        monitor=\"train_loss\",\n        save_top_k=2,\n    )\n    trainer = pl.Trainer(\n        max_epochs=1,\n        callbacks=[callback],\n        logger=False,\n    )\n    trainer.fit(simple_model, dataloader)\n\n    prefix = callback.checkpoint_key_prefix + \"/\"\n    artifacts = ln.Artifact.filter(key__startswith=prefix)\n    assert len(artifacts) >= 1\n\n\ndef test_checkpoint_best_model_with_duplicate_feature_names(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n    lightning_features: None,\n):\n    \"\"\"Clearing best-model flags should work when duplicate feature names exist.\n\n    Regression test: when a Feature named 'is_best_model' exists both under the\n    lamindb.lightning type and without a type (or under a different type),\n    remove_values used to call Feature.get(name=...) which raised\n    MultipleObjectsReturned. 
The fix uses type-scoped Feature lookups.\n    \"\"\"\n    # Create a duplicate 'is_best_model' feature without the lightning type\n    ln.Feature(name=\"is_best_model\", dtype=bool).save()\n\n    callback = ll.Checkpoint(\n        dirpath=dirpath,\n        monitor=\"train_loss\",\n        save_top_k=2,\n        mode=\"min\",\n    )\n    trainer = pl.Trainer(\n        max_epochs=3,\n        callbacks=[callback],\n        logger=False,\n    )\n    # This would raise MultipleObjectsReturned before the fix\n    trainer.fit(simple_model, dataloader)\n\n    prefix = callback.checkpoint_key_prefix + \"/\"\n    artifacts = ln.Artifact.filter(key__startswith=prefix)\n    assert len(artifacts) >= 1\n\n    best_count = sum(\n        1 for af in artifacts if af.features.get_values().get(\"is_best_model\") is True\n    )\n    assert best_count == 1\n    last_count = sum(\n        1 for af in artifacts if af.features.get_values().get(\"is_last_model\") is True\n    )\n    assert last_count == 1\n\n\ndef test_checkpoint_query_budget_scales_sublinearly_with_hparams(\n    dataloader: DataLoader, dirpath: str, lightning_features: None\n):\n    \"\"\"DB queries should not scale linearly with hparam count.\"\"\"\n\n    class ModelWithManyHparams(pl.LightningModule):\n        def __init__(self, n_hparams: int):\n            super().__init__()\n            self.layer = nn.Linear(10, 1)\n            self.save_hyperparameters({f\"hp_{i}\": i for i in range(n_hparams)})\n\n        def forward(self, x):\n            return self.layer(x)\n\n        def training_step(self, batch, batch_idx):\n            x, y = batch\n            loss = nn.functional.mse_loss(self(x), y)\n            self.log(\"train_loss\", loss)\n            return loss\n\n        def configure_optimizers(self):\n            return torch.optim.Adam(self.parameters())\n\n    def count_fit_queries(n_hparams: int) -> int:\n        model = ModelWithManyHparams(n_hparams)\n        callback = ll.Checkpoint(\n            
dirpath=f\"{dirpath.rstrip('/')}/{n_hparams}/\", monitor=\"train_loss\"\n        )\n        trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=False)\n        with CaptureQueriesContext(connection) as ctx:\n            trainer.fit(model, dataloader)\n        return len(ctx.captured_queries)\n\n    low_hparams_queries = count_fit_queries(2)\n    high_hparams_queries = count_fit_queries(40)\n    assert high_hparams_queries <= low_hparams_queries + 10\n\n\ndef test_model_rank_update_query_budget(\n    dirpath: str,\n    tmp_path: Path,\n    monkeypatch: pytest.MonkeyPatch,\n    lightning_features: None,\n):\n    \"\"\"Ranking should use batched feature reads.\"\"\"\n    callback = ll.Checkpoint(dirpath=dirpath, monitor=\"train_loss\", mode=\"min\")\n    # Provide a stub trainer so checkpoint_key_prefix can compute on-the-fly.\n    # Only _original_dirpath matters for key derivation here.\n    stub_trainer = MagicMock(spec=pl.Trainer)\n    stub_trainer.loggers = []\n    callback._trainer = stub_trainer\n    key_prefix = callback.checkpoint_key_prefix\n    created_artifacts = []\n\n    for i in range(8):\n        model_file = tmp_path / f\"model_{i}.ckpt\"\n        model_file.write_bytes(f\"checkpoint-{i}\".encode())\n        artifact = ln.Artifact(\n            model_file, key=f\"{key_prefix}/model_{i}.ckpt\", kind=\"model\"\n        )\n        artifact.save()\n        artifact.features.add_values({\"score\": float(i), \"model_rank\": i})\n        created_artifacts.append(artifact)\n\n    monkeypatch.setattr(FeatureManager, \"remove_values\", lambda *args, **kwargs: None)\n    monkeypatch.setattr(FeatureManager, \"add_values\", lambda *args, **kwargs: None)\n\n    with CaptureQueriesContext(connection) as ctx:\n        callback._feature_annotator.update_model_ranks(key_prefix, mode=\"min\")\n    assert len(ctx.captured_queries) <= 6\n\n    for artifact in created_artifacts:\n        artifact.delete(permanent=True, storage=True)\n\n\ndef 
test_checkpoint_best_model_tracking(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n    lightning_features: None,\n):\n    \"\"\"Only one checkpoint should be marked as best model.\"\"\"\n    callback = ll.Checkpoint(\n        dirpath=dirpath,\n        monitor=\"train_loss\",\n        save_top_k=3,\n        mode=\"min\",\n    )\n    trainer = pl.Trainer(\n        max_epochs=3,\n        callbacks=[callback],\n        logger=False,\n    )\n    trainer.fit(simple_model, dataloader)\n\n    prefix = callback.checkpoint_key_prefix + \"/\"\n    artifacts = ln.Artifact.filter(key__startswith=prefix)\n    best_count = sum(\n        1 for af in artifacts if af.features.get_values().get(\"is_best_model\") is True\n    )\n    assert best_count == 1\n\n\ndef test_checkpoint_model_rank(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n    lightning_features: None,\n):\n    \"\"\"Checkpoints should have correct model_rank (0 = best).\"\"\"\n    callback = ll.Checkpoint(\n        dirpath=dirpath,\n        monitor=\"train_loss\",\n        save_top_k=3,\n        mode=\"min\",\n    )\n    trainer = pl.Trainer(\n        max_epochs=3,\n        callbacks=[callback],\n        logger=False,\n    )\n    trainer.fit(simple_model, dataloader)\n\n    prefix = callback.checkpoint_key_prefix + \"/\"\n    artifacts = ln.Artifact.filter(key__startswith=prefix)\n    ranks = [af.features.get_values().get(\"model_rank\") for af in artifacts]\n    assert 0 in ranks  # best model has rank 0\n    last_count = sum(\n        1 for af in artifacts if af.features.get_values().get(\"is_last_model\") is True\n    )\n    assert last_count == 1\n\n\ndef test_checkpoint_last_model_points_to_last_saved_artifact(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n    lightning_features: None,\n):\n    \"\"\"The artifact flagged as last model should be the last saved checkpoint artifact.\"\"\"\n    
checkpoint = ll.Checkpoint(\n        dirpath=dirpath,\n        monitor=\"train_loss\",\n        save_top_k=3,\n        mode=\"min\",\n    )\n    trainer = pl.Trainer(\n        max_epochs=3,\n        callbacks=[checkpoint],\n        logger=False,\n    )\n    trainer.fit(simple_model, dataloader)\n\n    prefix = checkpoint.checkpoint_key_prefix + \"/\"\n    artifacts = list(ln.Artifact.filter(key__startswith=prefix))\n    last_artifacts = [\n        artifact\n        for artifact in artifacts\n        if artifact.features.get_values().get(\"is_last_model\") is True\n    ]\n\n    assert len(last_artifacts) == 1\n    assert checkpoint.last_checkpoint_artifact is not None\n    assert last_artifacts[0].id == checkpoint.last_checkpoint_artifact.id\n\n\ndef test_checkpoint_semantic_paths(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n    lightning_features: None,\n):\n    \"\"\"Checkpoints should have semantic keys derived from dirpath.\"\"\"\n    callback = ll.Checkpoint(\n        dirpath=dirpath,\n        monitor=\"train_loss\",\n        save_top_k=3,\n    )\n    trainer = pl.Trainer(\n        max_epochs=3,\n        callbacks=[callback],\n        logger=False,\n    )\n    trainer.fit(simple_model, dataloader)\n\n    prefix = callback.checkpoint_key_prefix + \"/\"\n    artifacts = ln.Artifact.filter(key__startswith=prefix)\n    assert len(artifacts) >= 1\n\n    for af in artifacts:\n        assert af.key.startswith(prefix)\n        values = af.features.get_values()\n        assert \"is_best_model\" in values\n        assert \"score\" in values\n\n\ndef test_callback_deprecated(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    tmp_path: Path,\n):\n    \"\"\"Deprecated Callback should still work.\"\"\"\n    key = f\"test/legacy/{tmp_path.name}/model.ckpt\"\n    path = tmp_path / \"model.ckpt\"\n\n    with pytest.warns(DeprecationWarning, match=\"use ll.Checkpoint instead\"):\n        callback = 
ll.Callback(path=path, key=key)\n\n    trainer = pl.Trainer(\n        max_epochs=1,\n        callbacks=[callback],\n        logger=False,\n    )\n    trainer.fit(simple_model, dataloader)\n\n    artifacts = ln.Artifact.filter(key=key)\n    assert len(artifacts) >= 1\n    assert artifacts[0].kind == \"model\"\n\n    # cleanup\n    for af in artifacts:\n        af.delete(permanent=True, storage=True)\n\n\ndef test_checkpoint_overwrites_existing_artifact(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n    tmp_path: Path,\n    monkeypatch: pytest.MonkeyPatch,\n):\n    \"\"\"Checkpoint with same key should transparently replace the existing artifact.\"\"\"\n    dummy = tmp_path / \"dummy.ckpt\"\n    dummy.write_bytes(b\"dummy\")\n    fixed_key = f\"{dirpath.rstrip('/')}/fixed.ckpt\"\n    ln.Artifact(dummy, key=fixed_key).save()\n    old_uid = ln.Artifact.filter(key=fixed_key).one().uid\n\n    callback = ll.Checkpoint(dirpath=dirpath)\n    monkeypatch.setattr(callback, \"resolve_artifact_key\", lambda **kwargs: fixed_key)\n    trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=False)\n    trainer.fit(simple_model, dataloader)\n\n    new_artifact = ln.Artifact.filter(key=fixed_key).one()\n    assert new_artifact.uid != old_uid\n\n    for af in ln.Artifact.filter(key=fixed_key):\n        af.delete(permanent=True, storage=True)\n\n\ndef test_checkpoint_invalid_feature_keys(dirpath: str):\n    \"\"\"Checkpoint should raise on invalid feature keys.\"\"\"\n    with pytest.raises(ValueError, match=\"Invalid feature keys\"):\n        ll.Checkpoint(\n            dirpath=dirpath,\n            features={\"invalid_key\": {\"foo\": \"bar\"}},  # type: ignore\n        )\n\n\ndef test_checkpoint_hparams(dataloader: DataLoader, dirpath: str, lightning_features):\n    \"\"\"Checkpoint should auto-capture model hparams if features exist.\"\"\"\n\n    class ModelWithHparams(pl.LightningModule):\n        def __init__(self, hidden_size: 
int = 32, learning_rate: float = 0.001):\n            super().__init__()\n            self.save_hyperparameters()\n            self.layer = nn.Linear(10, hidden_size)\n            self.out = nn.Linear(hidden_size, 1)\n\n        def forward(self, x):\n            return self.out(torch.relu(self.layer(x)))\n\n        def training_step(self, batch, batch_idx):\n            x, y = batch\n            loss = nn.functional.mse_loss(self(x), y)\n            self.log(\"train_loss\", loss)\n            return loss\n\n        def configure_optimizers(self):\n            return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)\n\n    ln.Feature(name=\"hidden_size\", dtype=int).save()\n    ln.Feature(name=\"learning_rate\", dtype=float).save()\n\n    ln.track()\n\n    model = ModelWithHparams(hidden_size=64, learning_rate=0.01)\n    callback = ll.Checkpoint(dirpath=dirpath, monitor=\"train_loss\")\n    trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=False)\n    trainer.fit(model, dataloader)\n\n    run = ln.context.run\n    run_features = run.features.get_values()\n    assert run_features[\"hidden_size\"] == 64\n    assert run_features[\"learning_rate\"] == 0.01\n\n    ln.finish()\n\n\ndef test_checkpoint_datamodule_hparams(\n    simple_model: pl.LightningModule, dirpath: str, lightning_features\n):\n    \"\"\"Checkpoint should auto-capture datamodule hparams if features exist.\"\"\"\n\n    class DataModuleWithHparams(pl.LightningDataModule):\n        def __init__(self, batch_size: int = 32, num_workers: int = 4):\n            super().__init__()\n            self.save_hyperparameters()\n\n        def train_dataloader(self):\n            return DataLoader(\n                TensorDataset(torch.randn(100, 10), torch.randn(100, 1)),\n                batch_size=self.hparams.batch_size,\n            )\n\n    ln.Feature(name=\"batch_size\", dtype=int).save()\n    ln.Feature(name=\"num_workers\", dtype=int).save()\n\n    ln.track()\n\n    datamodule = 
DataModuleWithHparams(batch_size=16, num_workers=2)\n    callback = ll.Checkpoint(dirpath=dirpath, monitor=\"train_loss\")\n    trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=False)\n    trainer.fit(simple_model, datamodule=datamodule)\n\n    run = ln.context.run\n    run_features = run.features.get_values()\n    assert run_features[\"batch_size\"] == 16\n    assert run_features[\"num_workers\"] == 2\n\n    ln.finish()\n\n\ndef test_checkpoint_trainer_config(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n    lightning_features: None,\n):\n    \"\"\"Checkpoint should auto-capture trainer config if features exist.\"\"\"\n    ln.track()\n\n    callback = ll.Checkpoint(\n        dirpath=dirpath,\n        monitor=\"train_loss\",\n        save_weights_only=True,\n        mode=\"min\",\n    )\n    trainer = pl.Trainer(\n        max_epochs=5,\n        max_steps=100,\n        precision=\"32\",\n        accumulate_grad_batches=2,\n        gradient_clip_val=0.5,\n        callbacks=[callback],\n        logger=False,\n    )\n    trainer.fit(simple_model, dataloader)\n\n    run_features = ln.context.run.features.get_values()\n    artifacts = ln.Artifact.filter(key__startswith=callback.checkpoint_key_prefix + \"/\")\n    assert run_features[\"max_epochs\"] == 5\n    assert run_features[\"max_steps\"] == 100\n    assert run_features[\"precision\"] == \"32-true\"\n    assert run_features[\"accumulate_grad_batches\"] == 2\n    assert run_features[\"gradient_clip_val\"] == 0.5\n    assert run_features[\"monitor\"] == \"train_loss\"\n    assert run_features[\"mode\"] == \"min\"\n    assert \"save_weights_only\" not in run_features\n\n    assert len(artifacts) >= 1\n    for artifact in artifacts:\n        artifact_features = artifact.features.get_values()\n        assert artifact_features[\"save_weights_only\"] is True\n        assert artifact_features[\"monitor\"] == \"train_loss\"\n        assert artifact_features[\"mode\"] 
== \"min\"\n\n    ln.finish()\n\n\ndef test_checkpoint_hparams_yaml_with_hparams(\n    dataloader: DataLoader,\n    dirpath: str,\n    tmp_path: Path,\n):\n    \"\"\"Checkpoint should save hparams.yaml when model has hyperparameters.\"\"\"\n    from lightning.pytorch.loggers import CSVLogger\n\n    class ModelWithHparams(pl.LightningModule):\n        def __init__(self, hidden_size: int = 32):\n            super().__init__()\n            self.save_hyperparameters()\n            self.layer = nn.Linear(10, hidden_size)\n            self.out = nn.Linear(hidden_size, 1)\n\n        def forward(self, x):\n            return self.out(torch.relu(self.layer(x)))\n\n        def training_step(self, batch, batch_idx):\n            x, y = batch\n            loss = nn.functional.mse_loss(self(x), y)\n            self.log(\"train_loss\", loss)\n            return loss\n\n        def configure_optimizers(self):\n            return torch.optim.Adam(self.parameters())\n\n    logger = CSVLogger(save_dir=tmp_path, name=\"test_logs\")\n\n    model = ModelWithHparams(hidden_size=64)\n    callback = ll.Checkpoint(dirpath=dirpath, monitor=\"train_loss\")\n    trainer = pl.Trainer(\n        max_epochs=1,\n        callbacks=[callback],\n        logger=logger,\n    )\n    trainer.fit(model, dataloader)\n\n    resolved_dirpath = callback.checkpoint_key_prefix\n    hparams_key = f\"{resolved_dirpath}/hparams.yaml\"\n    hparams_artifact = ln.Artifact.filter(key=hparams_key).one_or_none()\n\n    assert hparams_artifact is not None\n    assert hparams_artifact.description == \"Lightning run hyperparameters\"\n\n    # cleanup\n    hparams_artifact.delete(permanent=True)\n    shutil.rmtree(tmp_path / \"test_logs\", ignore_errors=True)\n\n\n@pytest.mark.parametrize(\n    (\"use_dirpath\", \"use_logger\"),\n    [\n        (True, True),\n        (False, True),\n        (True, False),\n        (False, False),\n    ],\n    ids=[\n        \"dirpath-logger\",\n        \"no-dirpath-logger\",\n        
\"dirpath-no-logger\",\n        \"no-dirpath-no-logger\",\n    ],\n)\ndef test_key_layout_matrix(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    tmp_path: Path,\n    use_dirpath: bool,\n    use_logger: bool,\n):\n    \"\"\"Artifact keys must follow the base-prefix layout across all 4 configurations.\n\n    With ``run_uid_is_version=True`` and an active Lamin run, the expected\n    key layout is::\n\n        {base}/checkpoints/{ckpt_filename}\n        {base}/config.yaml              (when SaveConfigCallback is used)\n        {base}/checkpoints/hparams.yaml (when model has hyperparameters)\n\n    Where ``base`` is determined by:\n\n    ==============================  ==================================\n    Scenario                        Base prefix\n    ==============================  ==================================\n    dirpath set (± logger)          ``{dirpath}/{run_uid}``\n    no dirpath + logger             ``{save_dir_name}/{name}/{run_uid}``\n    no dirpath + no logger          ``{run_uid}``\n    ==============================  ==================================\n    \"\"\"\n    from lightning.pytorch.loggers import CSVLogger\n\n    class ParserStub:\n        def save(self, config, path, skip_none, overwrite, multifile):\n            del skip_none, overwrite, multifile\n            Path(path).write_text(json.dumps(config, indent=2))\n\n    dirpath = str(tmp_path / \"layout_test\")\n\n    ln.track()\n    run_uid = ln.context.run.uid\n\n    logger: CSVLogger | bool\n    logger_name = \"layout_exp\"\n    if use_logger:\n        logger = CSVLogger(save_dir=tmp_path, name=logger_name)\n    else:\n        logger = False\n\n    checkpoint = ll.Checkpoint(\n        dirpath=dirpath if use_dirpath else None,\n        monitor=\"train_loss\",\n        run_uid_is_version=True,\n    )\n    config = {\"trainer\": {\"max_epochs\": 1}}\n    save_config = ll.SaveConfigCallback(\n        parser=cast(Any, ParserStub()),\n        config=config,\n      
  config_filename=\"config.yaml\",\n    )\n\n    trainer = pl.Trainer(\n        max_epochs=1,\n        callbacks=[checkpoint, save_config],\n        logger=logger,\n        default_root_dir=tmp_path,\n    )\n    trainer.fit(simple_model, dataloader)\n\n    # Determine expected base prefix\n    if use_dirpath:\n        expected_base = f\"{dirpath.rstrip('/')}/{run_uid}\"\n    elif use_logger:\n        expected_base = f\"{tmp_path.name}/{logger_name}/{run_uid}\"\n    else:\n        expected_base = run_uid\n\n    # Verify base_prefix\n    assert checkpoint.base_prefix == expected_base\n\n    # Verify checkpoint key prefix\n    expected_ckpt_prefix = f\"{expected_base}/checkpoints\"\n    assert checkpoint.checkpoint_key_prefix == expected_ckpt_prefix\n\n    # Verify checkpoint artifacts exist under the correct prefix\n    ckpt_artifacts = ln.Artifact.filter(key__startswith=expected_ckpt_prefix + \"/\")\n    assert len(ckpt_artifacts) >= 1\n    for af in ckpt_artifacts:\n        assert af.key.startswith(expected_ckpt_prefix + \"/\")\n\n    # Verify config artifact sits directly under the base prefix\n    expected_config_key = f\"{expected_base}/config.yaml\"\n    config_artifact = ln.Artifact.filter(key=expected_config_key).one_or_none()\n    assert config_artifact is not None, f\"Expected config at {expected_config_key}\"\n\n    # Cleanup\n    json_values = ln.models.JsonValue.filter(links_artifact__artifact=config_artifact)\n    ln.models.ArtifactJsonValue.filter(artifact=config_artifact).delete()\n    config_artifact.delete(permanent=True, storage=True)\n    json_values.delete(permanent=True)\n    for af in ckpt_artifacts:\n        af.delete(permanent=True, storage=True)\n    ln.finish()\n    if use_logger:\n        shutil.rmtree(tmp_path / logger_name, ignore_errors=True)\n\n\ndef test_run_uid_not_in_key_when_disabled(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    tmp_path: Path,\n):\n    \"\"\"With run_uid_is_version=False, the key should 
use the logger version as before.\"\"\"\n    from lightning.pytorch.loggers import CSVLogger\n\n    ln.track()\n\n    logger = CSVLogger(save_dir=tmp_path, name=\"no_uid_test\")\n    callback = ll.Checkpoint(monitor=\"train_loss\", run_uid_is_version=False)\n    trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=logger)\n    trainer.fit(simple_model, dataloader)\n\n    prefix = callback.checkpoint_key_prefix\n    assert \"version_0\" in prefix\n    assert prefix == f\"{tmp_path.name}/no_uid_test/version_0/checkpoints\"\n\n    artifacts = ln.Artifact.filter(key__startswith=prefix + \"/\")\n    assert len(artifacts) >= 1\n\n    for af in artifacts:\n        af.delete(permanent=True, storage=True)\n    ln.finish()\n    shutil.rmtree(tmp_path / \"no_uid_test\", ignore_errors=True)\n\n\ndef test_two_runs_same_logger_produce_different_keys(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    tmp_path: Path,\n):\n    \"\"\"Two tracked runs with the same logger config should not collide on keys.\"\"\"\n    from lightning.pytorch.loggers import CSVLogger\n\n    prefixes = []\n    for _ in range(2):\n        ln.track()\n        logger = CSVLogger(save_dir=tmp_path, name=\"collision_test\")\n        callback = ll.Checkpoint(monitor=\"train_loss\", run_uid_is_version=True)\n        trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=logger)\n        trainer.fit(simple_model, dataloader)\n        prefixes.append(callback.checkpoint_key_prefix)\n        ln.finish()\n\n    assert prefixes[0] != prefixes[1], \"Two runs should produce different key prefixes\"\n\n    for prefix in prefixes:\n        for af in ln.Artifact.filter(key__startswith=prefix + \"/\"):\n            af.delete(permanent=True, storage=True)\n    shutil.rmtree(tmp_path / \"collision_test\", ignore_errors=True)\n\n\n@pytest.mark.parametrize(\n    (\"use_dirpath\", \"logger_name\", \"key_source\"),\n    [\n        (False, \"my_experiment\", \"logger\"),\n        
(False, None, \"checkpoints\"),\n        (True, \"should_not_appear\", \"dirpath\"),\n        (True, None, \"dirpath\"),\n    ],\n    ids=[\n        \"without-dirpath-with-logger\",\n        \"without-dirpath-without-logger\",\n        \"with-dirpath-with-logger\",\n        \"with-dirpath-without-logger\",\n    ],\n)\ndef test_checkpoint_artifact_key_prefix_matrix(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n    tmp_path: Path,\n    use_dirpath: bool,\n    logger_name: str | None,\n    key_source: str,\n):\n    \"\"\"Checkpoint artifact keys should match the dirpath/logger configuration matrix.\"\"\"\n    from lightning.pytorch.loggers import CSVLogger\n\n    logger: CSVLogger | bool\n    if logger_name is None:\n        logger = False\n    else:\n        logger = CSVLogger(save_dir=tmp_path, name=logger_name)\n\n    callback = ll.Checkpoint(\n        dirpath=dirpath if use_dirpath else None,\n        monitor=\"train_loss\",\n    )\n    trainer = pl.Trainer(\n        max_epochs=2,\n        callbacks=[callback],\n        logger=logger,\n    )\n    trainer.fit(simple_model, dataloader)\n\n    prefix = callback.checkpoint_key_prefix\n    if key_source == \"logger\":\n        assert prefix == f\"{tmp_path.name}/{logger_name}/version_0/checkpoints\"\n    elif key_source == \"checkpoints\":\n        assert prefix == \"checkpoints\"\n    else:\n        assert prefix == f\"{dirpath.rstrip('/')}/checkpoints\"\n        if logger_name is not None:\n            assert logger_name not in prefix\n\n    artifacts = ln.Artifact.filter(key__startswith=prefix + \"/\")\n    assert len(artifacts) >= 1\n    for af in artifacts:\n        assert af.kind == \"model\"\n        assert af.key.startswith(prefix + \"/\")\n\n    if not use_dirpath:\n        for af in artifacts:\n            af.delete(permanent=True, storage=True)\n\n    if logger_name is not None:\n        shutil.rmtree(tmp_path / logger_name, ignore_errors=True)\n\n\ndef 
test_checkpoint_auto_features_without_dirpath(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    tmp_path: Path,\n    lightning_features: None,\n):\n    \"\"\"Auto-features (best model, score, rank) should work without dirpath.\"\"\"\n    from lightning.pytorch.loggers import CSVLogger\n\n    logger = CSVLogger(save_dir=tmp_path, name=\"auto_feat\")\n\n    callback = ll.Checkpoint(\n        monitor=\"train_loss\",\n        save_top_k=2,\n        mode=\"min\",\n    )\n    trainer = pl.Trainer(\n        max_epochs=3,\n        callbacks=[callback],\n        logger=logger,\n    )\n    trainer.fit(simple_model, dataloader)\n\n    prefix = callback.checkpoint_key_prefix\n    artifacts = ln.Artifact.filter(key__startswith=prefix + \"/\")\n    assert len(artifacts) >= 1\n\n    for af in artifacts:\n        values = af.features.get_values()\n        assert \"is_best_model\" in values\n        assert \"score\" in values\n        assert \"model_rank\" in values\n\n    best_count = sum(\n        1 for af in artifacts if af.features.get_values().get(\"is_best_model\") is True\n    )\n    assert best_count == 1\n\n    ranks = [af.features.get_values().get(\"model_rank\") for af in artifacts]\n    assert 0 in ranks\n\n    # cleanup\n    for af in artifacts:\n        af.delete(permanent=True, storage=True)\n    shutil.rmtree(tmp_path / \"auto_feat\", ignore_errors=True)\n\n\n@pytest.mark.parametrize(\n    (\"use_dirpath\", \"logger_name\", \"key_source\"),\n    [\n        (False, \"cli_logs\", \"logger\"),\n        (False, None, \"filename\"),\n        (True, \"cli_logs\", \"dirpath\"),\n        (True, None, \"dirpath\"),\n    ],\n    ids=[\n        \"without-dirpath-with-logger\",\n        \"without-dirpath-without-logger\",\n        \"with-dirpath-with-logger\",\n        \"with-dirpath-without-logger\",\n    ],\n)\ndef test_save_config_artifact_key_matrix(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n    tmp_path: 
Path,\n    use_dirpath: bool,\n    logger_name: str | None,\n    key_source: str,\n):\n    \"\"\"Config artifacts should be stored under the base prefix (dirpath > logger > empty).\"\"\"\n    from lightning.pytorch.loggers import CSVLogger\n\n    class ParserStub:\n        def save(\n            self,\n            config,\n            path,\n            skip_none: bool,\n            overwrite: bool,\n            multifile: bool,\n        ) -> None:\n            del skip_none, overwrite, multifile\n            Path(path).write_text(json.dumps(config, indent=2))\n\n    logger: CSVLogger | bool\n    if logger_name is None:\n        logger = False\n    else:\n        logger = CSVLogger(save_dir=tmp_path, name=logger_name)\n\n    checkpoint = ll.Checkpoint(\n        dirpath=dirpath if use_dirpath else None,\n        monitor=\"train_loss\",\n    )\n    config = {\"trainer\": {\"max_epochs\": 1}, \"model\": {\"hidden_size\": 1}}\n    save_config = ll.SaveConfigCallback(\n        parser=cast(Any, ParserStub()),\n        config=config,\n        config_filename=\"config.yaml\",\n    )\n\n    trainer = pl.Trainer(\n        max_epochs=1,\n        callbacks=[checkpoint, save_config],\n        logger=logger,\n        default_root_dir=tmp_path,\n    )\n    trainer.fit(simple_model, dataloader)\n\n    assert trainer.log_dir is not None\n    local_config_path = Path(trainer.log_dir) / \"config.yaml\"\n    assert local_config_path.exists()\n    assert \"max_epochs\" in local_config_path.read_text()\n    if use_dirpath:\n        assert dirpath.rstrip(\"/\") not in str(local_config_path)\n\n    if key_source == \"logger\":\n        assert logger_name is not None\n        config_key = f\"{tmp_path.name}/{logger_name}/version_0/config.yaml\"\n    elif key_source == \"dirpath\":\n        config_key = f\"{dirpath.rstrip('/')}/config.yaml\"\n    else:\n        config_key = \"config.yaml\"\n    config_artifact = ln.Artifact.filter(key=config_key).one_or_none()\n    assert config_artifact is 
not None\n    assert config_artifact.description == \"Lightning CLI config\"\n\n    checkpoint_artifacts = ln.Artifact.filter(\n        key__startswith=checkpoint.checkpoint_key_prefix + \"/\"\n    )\n    assert len(checkpoint_artifacts) >= 1\n\n    json_values = ln.models.JsonValue.filter(links_artifact__artifact=config_artifact)\n    ln.models.ArtifactJsonValue.filter(artifact=config_artifact).delete()\n    config_artifact.delete(permanent=True, storage=True)\n    json_values.delete(permanent=True)\n    for artifact in checkpoint_artifacts:\n        artifact.delete(permanent=True, storage=True)\n    shutil.rmtree(tmp_path / \"cli_logs\", ignore_errors=True)\n\n\ndef test_save_config_artifact_tracked_as_run_input(\n    simple_model: pl.LightningModule,\n    dataloader: DataLoader,\n    dirpath: str,\n    tmp_path: Path,\n):\n    \"\"\"Config artifacts should be tracked as run inputs while checkpoints stay outputs.\"\"\"\n\n    class ParserStub:\n        def save(\n            self,\n            config,\n            path,\n            skip_none: bool,\n            overwrite: bool,\n            multifile: bool,\n        ) -> None:\n            del skip_none, overwrite, multifile\n            Path(path).write_text(json.dumps(config, indent=2))\n\n    ln.track()\n\n    checkpoint = ll.Checkpoint(dirpath=dirpath, monitor=\"train_loss\")\n    save_config = ll.SaveConfigCallback(\n        parser=cast(Any, ParserStub()),\n        config={\"trainer\": {\"max_epochs\": 1}},\n        config_filename=\"config.yaml\",\n    )\n    trainer = pl.Trainer(\n        max_epochs=1,\n        callbacks=[checkpoint, save_config],\n        logger=False,\n        default_root_dir=tmp_path,\n    )\n    trainer.fit(simple_model, dataloader)\n\n    run = ln.context.run\n    assert run is not None\n    assert checkpoint.last_config_artifact is not None\n    assert checkpoint.last_checkpoint_artifact is not None\n\n    config_artifact = checkpoint.last_config_artifact\n    checkpoint_artifact = 
checkpoint.last_checkpoint_artifact\n\n    assert config_artifact.run is None\n    assert run in config_artifact.input_of_runs.all()\n\n    assert checkpoint_artifact.run == run\n    assert checkpoint_artifact.input_of_runs.count() == 0\n\n    config_artifact.delete(permanent=True, storage=True)\n    checkpoint_artifact.delete(permanent=True, storage=True)\n    ln.finish()\n\n\ndef test_checkpoint_subclass_receives_artifact_events(\n    dataloader: DataLoader,\n    dirpath: str,\n    tmp_path: Path,\n):\n    \"\"\"Subclass hooks should receive checkpoint, config, and hparams artifacts.\"\"\"\n    from lightning.pytorch.loggers import CSVLogger\n\n    class ModelWithHparams(pl.LightningModule):\n        def __init__(self, hidden_size: int = 32):\n            super().__init__()\n            self.save_hyperparameters()\n            self.layer = nn.Linear(10, hidden_size)\n            self.out = nn.Linear(hidden_size, 1)\n\n        def forward(self, x):\n            return self.out(torch.relu(self.layer(x)))\n\n        def training_step(self, batch, batch_idx):\n            x, y = batch\n            loss = nn.functional.mse_loss(self(x), y)\n            self.log(\"train_loss\", loss)\n            return loss\n\n        def configure_optimizers(self):\n            return torch.optim.Adam(self.parameters())\n\n    class ParserStub:\n        def save(\n            self,\n            config,\n            path,\n            skip_none: bool,\n            overwrite: bool,\n            multifile: bool,\n        ) -> None:\n            del skip_none, overwrite, multifile\n            Path(path).write_text(json.dumps(config, indent=2))\n\n    class RecordingCheckpoint(ll.Checkpoint):\n        def __init__(self, **kwargs):\n            super().__init__(**kwargs)\n            self.saved_events: list[ll.ArtifactSavedEvent] = []\n\n        def on_artifact_saved(self, event: ll.ArtifactSavedEvent) -> None:\n            self.saved_events.append(event)\n\n    logger = 
CSVLogger(save_dir=tmp_path, name=\"recording_logs\")\n    checkpoint = RecordingCheckpoint(dirpath=dirpath, monitor=\"train_loss\")\n    save_config = ll.SaveConfigCallback(\n        parser=cast(Any, ParserStub()),\n        config={\"trainer\": {\"max_epochs\": 1}},\n        config_filename=\"config.yaml\",\n    )\n    trainer = pl.Trainer(\n        max_epochs=1,\n        callbacks=[checkpoint, save_config],\n        logger=logger,\n        default_root_dir=tmp_path,\n    )\n    trainer.fit(ModelWithHparams(), dataloader)\n\n    assert {event.kind for event in checkpoint.saved_events} >= {\n        \"checkpoint\",\n        \"config\",\n        \"hparams\",\n    }\n    assert checkpoint.last_checkpoint_artifact is not None\n    assert checkpoint.last_config_artifact is not None\n    assert checkpoint.last_hparams_artifact is not None\n    assert checkpoint.last_checkpoint_artifact.key.startswith(\n        checkpoint.checkpoint_key_prefix + \"/\"\n    )\n    assert checkpoint.last_config_artifact.key.endswith(\"/config.yaml\")\n    assert checkpoint.last_hparams_artifact.key == (\n        f\"{checkpoint.checkpoint_key_prefix}/hparams.yaml\"\n    )\n    checkpoint_event = next(\n        event for event in checkpoint.saved_events if event.kind == \"checkpoint\"\n    )\n    assert checkpoint_event.key.startswith(checkpoint.checkpoint_key_prefix + \"/\")\n    assert checkpoint_event.storage_uri == checkpoint.resolve_artifact_storage_uri(\n        checkpoint_event.artifact\n    )\n    assert checkpoint_event.storage_uri.endswith(\".ckpt\")\n\n    artifacts_by_key = {event.key: event.artifact for event in checkpoint.saved_events}\n    for artifact in artifacts_by_key.values():\n        ln.models.ArtifactJsonValue.filter(artifact=artifact).delete()\n        ln.models.JsonValue.filter(links_artifact__artifact=artifact).delete(\n            permanent=True\n        )\n        artifact.delete(permanent=True, storage=True)\n    shutil.rmtree(tmp_path / \"recording_logs\", 
ignore_errors=True)\n\n\ndef test_checkpoint_artifact_observers_receive_shared_events(\n    dataloader: DataLoader,\n    dirpath: str,\n    tmp_path: Path,\n):\n    \"\"\"Observers should see the same checkpoint/config/hparams events as subclasses.\"\"\"\n    from lightning.pytorch.loggers import CSVLogger\n\n    class ModelWithHparams(pl.LightningModule):\n        def __init__(self, hidden_size: int = 32):\n            super().__init__()\n            self.save_hyperparameters()\n            self.layer = nn.Linear(10, hidden_size)\n            self.out = nn.Linear(hidden_size, 1)\n\n        def forward(self, x):\n            return self.out(torch.relu(self.layer(x)))\n\n        def training_step(self, batch, batch_idx):\n            x, y = batch\n            loss = nn.functional.mse_loss(self(x), y)\n            self.log(\"train_loss\", loss)\n            return loss\n\n        def configure_optimizers(self):\n            return torch.optim.Adam(self.parameters())\n\n    class ParserStub:\n        def save(\n            self,\n            config,\n            path,\n            skip_none: bool,\n            overwrite: bool,\n            multifile: bool,\n        ) -> None:\n            del skip_none, overwrite, multifile\n            Path(path).write_text(json.dumps(config, indent=2))\n\n    class RecordingObserver:\n        def __init__(self):\n            self.saved_events: list[ll.ArtifactSavedEvent] = []\n\n        def on_artifact_saved(self, event: ll.ArtifactSavedEvent) -> None:\n            self.saved_events.append(event)\n\n        def on_artifact_removed(self, event: ll.ArtifactRemovedEvent) -> None:\n            del event\n\n    observer = RecordingObserver()\n    logger = CSVLogger(save_dir=tmp_path, name=\"observer_logs\")\n    checkpoint = ll.Checkpoint(\n        dirpath=dirpath,\n        monitor=\"train_loss\",\n        artifact_observers=[observer],\n    )\n    save_config = ll.SaveConfigCallback(\n        parser=cast(Any, ParserStub()),\n        
config={\"trainer\": {\"max_epochs\": 1}},\n        config_filename=\"config.yaml\",\n    )\n    trainer = pl.Trainer(\n        max_epochs=1,\n        callbacks=[checkpoint, save_config],\n        logger=logger,\n        default_root_dir=tmp_path,\n    )\n    trainer.fit(ModelWithHparams(), dataloader)\n\n    assert {event.kind for event in observer.saved_events} >= {\n        \"checkpoint\",\n        \"config\",\n        \"hparams\",\n    }\n    checkpoint_event = next(\n        event for event in observer.saved_events if event.kind == \"checkpoint\"\n    )\n    assert checkpoint_event.key.startswith(checkpoint.checkpoint_key_prefix + \"/\")\n    assert checkpoint_event.local_path.name.endswith(\".ckpt\")\n    assert checkpoint_event.storage_uri == checkpoint.resolve_artifact_storage_uri(\n        checkpoint_event.artifact\n    )\n    assert checkpoint.last_artifact_event is not None\n    assert checkpoint.get_last_artifact(\"config\") == checkpoint.last_config_artifact\n\n    artifacts_by_key = {event.key: event.artifact for event in observer.saved_events}\n    for artifact in artifacts_by_key.values():\n        ln.models.ArtifactJsonValue.filter(artifact=artifact).delete()\n        ln.models.JsonValue.filter(links_artifact__artifact=artifact).delete(\n            permanent=True\n        )\n        artifact.delete(permanent=True, storage=True)\n    shutil.rmtree(tmp_path / \"observer_logs\", ignore_errors=True)\n"
  },
  {
    "path": "tests/no_instance/conftest.py",
    "content": "import pytest\n\n\n@pytest.fixture\ndef ccaplog(caplog) -> pytest.LogCaptureFixture:\n    \"\"\"Add caplog handler to our custom logger at session start.\"\"\"\n    from lamin_utils._logger import logger\n\n    logger.addHandler(caplog.handler)\n\n    yield caplog\n\n    logger.removeHandler(caplog.handler)\n"
  },
  {
    "path": "tests/no_instance/test_connect_dynamic_import.py",
    "content": "def test_connect_dynamic_import(ccaplog):\n    import lamindb as ln\n\n    # this only currently works if not instance was configured in the environment\n    # in all other cases, we still trigger a reset_django() and hence django variables\n    # become stale in case of a dynamic import\n    assert ln.setup.settings.instance.slug == \"none/none\"\n\n    ln.connect(\"laminlabs/lamin-site-assets\")\n    assert \"connected in read-only mode\" in ccaplog.text\n    assert ln.Artifact.filter(key__startswith=\"blog\").count() > 0\n    ln.setup.disconnect()\n"
  },
  {
    "path": "tests/no_instance/test_import_side_effects.py",
    "content": "import importlib.util\nimport json\nimport os\nimport subprocess\nimport sys\nfrom pathlib import Path\n\nimport pytest\n\nREPO_ROOT = Path(__file__).resolve().parents[2]\nMODULE_NAMES = (\"anndata\", \"h5py\", \"pyarrow\")\nLIGHT_IMPORTS = {name: False for name in MODULE_NAMES}\n\n\nPROBE_CASES = [\n    (\n        \"storage package constants stay light\",\n        \"import lamindb.core.storage as storage\\n_ = storage.VALID_SUFFIXES\\n_ = storage.delete_storage\\n_ = storage.infer_filesystem\",\n        LIGHT_IMPORTS,\n        (),\n    ),\n    (\n        \"storage object helpers stay light\",\n        \"import lamindb.core.storage as storage\\n_ = storage.infer_suffix\\n_ = storage.write_to_disk\",\n        LIGHT_IMPORTS,\n        (),\n    ),\n    (\n        \"loaders basic helpers stay light\",\n        \"import lamindb.core.loaders as loaders\\n_ = loaders.load_json\\n_ = loaders.load_txt\\n_ = loaders.load_html\",\n        LIGHT_IMPORTS,\n        (),\n    ),\n    (\n        \"loaders tabular helpers stay light\",\n        \"import lamindb.core.loaders as loaders\\n_ = loaders.load_csv\\n_ = loaders.load_parquet\\n_ = loaders.load_tsv\",\n        LIGHT_IMPORTS,\n        (),\n    ),\n    (\n        \"loaders optional-format helpers stay light\",\n        \"import lamindb.core.loaders as loaders\\n_ = loaders.load_h5ad\\n_ = loaders.load_h5mu\\n_ = loaders.load_zarr\",\n        LIGHT_IMPORTS,\n        (),\n    ),\n    (\n        \"backed_access symbols stay light\",\n        \"from lamindb.core.storage._backed_access import BackedAccessor, backed_access, _open_dataframe\\n_ = BackedAccessor\\n_ = backed_access\\n_ = _open_dataframe\",\n        LIGHT_IMPORTS,\n        (),\n    ),\n    (\n        \"objects module import stays light\",\n        \"from lamindb.core.storage.objects import infer_suffix, write_to_disk\\n_ = infer_suffix\\n_ = write_to_disk\",\n        LIGHT_IMPORTS,\n        (),\n    ),\n    (\n        \"backed_access pyarrow dataframe 
path stays anndata-free\",\n        \"from upath import UPath\\nimport pyarrow as pa\\nimport pyarrow.parquet as pq\\nfrom lamindb.core.storage._backed_access import backed_access\\npath = UPath('test_import_side_effects.parquet')\\npq.write_table(pa.table({'col': [1]}), path.as_posix())\\ntry:\\n    _ = backed_access(path, engine='pyarrow')\\nfinally:\\n    if path.exists():\\n        path.unlink()\",\n        {\"anndata\": False, \"h5py\": False, \"pyarrow\": True},\n        (\"pyarrow\",),\n    ),\n    (\n        \"backed_access polars dataframe path stays light\",\n        \"from upath import UPath\\nfrom lamindb.core.storage._backed_access import backed_access\\npath = UPath('test_import_side_effects.csv')\\nwith path.open('w') as f:\\n    _ = f.write('col\\\\n1\\\\n')\\ntry:\\n    _ = backed_access(path, engine='polars')\\nfinally:\\n    if path.exists():\\n        path.unlink()\",\n        LIGHT_IMPORTS,\n        (\"polars\",),\n    ),\n]\n\n\ndef _probe_modules_loaded(code: str) -> dict[str, bool]:\n    env = os.environ.copy()\n    pythonpath = env.get(\"PYTHONPATH\")\n    env[\"PYTHONPATH\"] = (\n        str(REPO_ROOT)\n        if not pythonpath\n        else os.pathsep.join([str(REPO_ROOT), pythonpath])\n    )\n    probe_lines = [\n        \"import json\",\n        \"import sys\",\n        \"\",\n        f\"module_names = {MODULE_NAMES!r}\",\n        \"result = {name: (name in sys.modules) for name in module_names}\",\n        code,\n        'result.update({f\"{name}_after\": (name in sys.modules) for name in module_names})',\n        \"print(json.dumps(result))\",\n    ]\n    probe = \"\\n\".join(probe_lines)\n    completed = subprocess.run(\n        [sys.executable, \"-c\", probe],\n        check=True,\n        capture_output=True,\n        cwd=REPO_ROOT,\n        env=env,\n        text=True,\n    )\n    stdout_lines = [line for line in completed.stdout.splitlines() if line.strip()]\n    return json.loads(stdout_lines[-1])\n\n\ndef _assert_modules(\n    
result: dict[str, bool], expected_after: dict[str, bool], label: str\n):\n    for module_name in MODULE_NAMES:\n        assert result[module_name] is False, (\n            f\"{label}: {module_name} loaded before probe\"\n        )\n        assert result[f\"{module_name}_after\"] is expected_after[module_name], (\n            f\"{label}: unexpected {module_name} import state\"\n        )\n\n\n@pytest.mark.parametrize(\n    (\"label\", \"code\", \"expected_after\", \"required_modules\"),\n    PROBE_CASES,\n)\ndef test_storage_import_side_effects(\n    label: str,\n    code: str,\n    expected_after: dict[str, bool],\n    required_modules: tuple[str, ...],\n):\n    missing_modules = [\n        module_name\n        for module_name in required_modules\n        if importlib.util.find_spec(module_name) is None\n    ]\n    if missing_modules:\n        pytest.skip(f\"missing optional dependency: {', '.join(missing_modules)}\")\n\n    result = _probe_modules_loaded(code)\n    _assert_modules(result, expected_after, label)\n"
  },
  {
    "path": "tests/no_instance/test_no_default_instance.py",
    "content": "import lamindb as ln\nimport pandas as pd\nimport pytest\nfrom lamindb_setup.errors import CurrentInstanceNotConfigured\n\n\ndef test_no_read_only_warning(ccaplog):\n    ln.Artifact.connect(\"laminlabs/lamindata\")\n    ln.DB(\"laminlabs/lamindata\")\n\n    assert \"connected in read-only mode\" not in ccaplog.text\n\n\ndef test_instance_not_connected():\n    assert ln.setup.settings.instance.slug == \"none/none\"\n\n    with pytest.raises(CurrentInstanceNotConfigured):\n        ln.Artifact.filter().count()\n\n\ndef test_query_artifacts_lamindata():\n    artifacts = ln.Artifact.connect(\"laminlabs/lamindata\")\n    n_artifacts = artifacts.count()\n    assert n_artifacts > 0\n    assert n_artifacts > artifacts.filter().count()\n\n\ndef test_get_artifact_lamindata():\n    artifact = ln.Artifact.connect(\"laminlabs/lamindata\").get(\n        key=\"example_datasets/small_dataset1.parquet\"\n    )\n    assert isinstance(artifact.load(), pd.DataFrame)\n"
  },
  {
    "path": "tests/permissions/conftest.py",
    "content": "import shutil\nfrom subprocess import DEVNULL, run\nfrom time import perf_counter\n\nimport lamindb_setup as ln_setup\nimport pytest\nfrom lamin_utils import logger\n\n\ndef pytest_sessionstart():\n    t_execute_start = perf_counter()\n    # these are called in separate scripts because can't change connection\n    # within the same python process due to django\n    # init instance and setup RLS\n    run(  # noqa: S602\n        \"python ./tests/permissions/scripts/setup_instance.py\",\n        shell=True,\n        capture_output=False,\n    )\n    # populate permissions and models via the admin connection\n    run(  # noqa: S602\n        \"python ./tests/permissions/scripts/setup_access.py\",\n        shell=True,\n        capture_output=False,\n    )\n\n    total_time_elapsed = perf_counter() - t_execute_start\n    print(f\"time to setup the instance: {total_time_elapsed:.1f}s\")\n\n\ndef pytest_sessionfinish(session: pytest.Session):\n    logger.set_verbosity(1)\n    shutil.rmtree(\"./default_storage_permissions\")\n    ln_setup.delete(\"lamindb-test-permissions\", force=True)\n    run(\"docker stop pgtest && docker rm pgtest\", shell=True, stdout=DEVNULL)  # noqa: S602\n"
  },
  {
    "path": "tests/permissions/jwt_utils.py",
    "content": "import json\n\nimport psycopg2\n\n\ndef sign_jwt(db_url, payload: dict) -> str:\n    with psycopg2.connect(db_url) as conn, conn.cursor() as cur:\n        cur.execute(\n            \"\"\"\n                SELECT sign(\n                    %s::json,\n                    (SELECT security.get_secret('jwt_secret')),\n                    %s\n                )\n                \"\"\",\n            (json.dumps(payload), \"HS256\"),\n        )\n        token = cur.fetchone()[0]\n        if not token:\n            msg = \"Failed to generate JWT\"\n            raise ValueError(msg)\n        return token\n"
  },
  {
    "path": "tests/permissions/scripts/check_lamin_dev.py",
    "content": "import subprocess\nfrom unittest.mock import patch\n\nimport lamindb as ln\nimport pytest\nfrom lamindb_setup.core._hub_core import select_space, select_storage\n\n\ndef cleanup(records):\n    for record in records:\n        try:\n            if isinstance(record, ln.Storage):\n                record.artifacts.all().delete(permanent=True)\n            record.delete(permanent=True)\n        except Exception as e:\n            print(f\"Failed deleting {record}: {e}\")\n\n\nassert ln.setup.settings.user.handle == \"testuser1\"\n\nln.connect(\"laminlabs/lamin-dev\")\n\nassert ln.setup.settings.instance.slug == \"laminlabs/lamin-dev\"\n\n# check that the rename resolves correctly (it was renamed)\nassert ln.Artifact.connect(\"laminlabs/lamin-dev1072025\").db == \"default\"\n\nspace_name = \"Our test space for CI\"\nspace = ln.Space.get(name=space_name)\n\n# check that we throw an error if no storage location is managed by the space\nstorage_loc = ln.Storage.filter(space=space).one_or_none()\nif storage_loc is not None:\n    ln.Run.filter(report__storage=storage_loc).delete(permanent=True)\n    storage_loc.artifacts.all().delete(permanent=True)\n    storage_loc.delete(permanent=True)\n\nwith pytest.raises(ln.errors.NoStorageLocationForSpace) as error:\n    ln.track(space=space_name)  # this fails to save the env artifact\n    ln.context._transform = None\n    ln.context._run = None\n\n# now create the storage location in the space\nstorage_loc = ln.Storage(\"create-s3\", space=space).save()\nln.track(space=space_name)\ntry:\n    assert ln.context.space.name == space_name\n    ulabel = ln.ULabel(name=\"My test ulabel in test space\").save()\n\n    # cleanup if the artifact already exists\n    artifact = ln.Artifact(\".gitignore\", key=\"mytest\")\n    if (\n        artifact_cleanup := ln.Artifact.filter(hash=artifact.hash).one_or_none()\n    ) is not None:\n        artifact_cleanup.delete(permanent=True)\n\n    # cleanup if the directory artifact already 
exists\n    artifact_dir = ln.Artifact(\"./scripts\", key=\"mytest-dir\")\n    if (\n        artifact_cleanup := ln.Artifact.filter(hash=artifact_dir.hash).one_or_none()\n    ) is not None:\n        artifact_cleanup.delete(permanent=True)\n\n    artifact = ln.Artifact(\".gitignore\", key=\"mytest\").save()\n    artifact_dir = ln.Artifact(\"./scripts\", key=\"mytest-dir\").save()\n\n    # check that exist\n    ln.ULabel.get(name=\"My test ulabel in test space\")\n    ln.Artifact.get(key=\"mytest\")\n    ln.Artifact.get(key=\"mytest-dir\")\n\n    assert ulabel.space == space  # ulabel should end up in the restricted space\n    assert artifact.space == space\n    # the below check doesn't work: another worker might have associated another storage location with the space, and then the artifact ends up in that\n    # assert artifact.storage == storage_loc\n    # hence this check\n    assert artifact.storage in ln.Storage.filter(space=space)\n    assert ln.context.transform.space == space\n    assert ln.context.run.space == space\n\n    # move the artifact to another storage location\n    space_test_move = ln.Space.get(name=\"test-move\")\n    original_path = artifact.path\n    artifact.space = space_test_move\n    # cancel save\n    with patch(\"builtins.input\", return_value=\"x\"):\n        artifact.save()\n    # save to the new storage location\n    with patch(\"builtins.input\", return_value=\"1\"):\n        artifact.save()\n    assert artifact.space == space_test_move\n    assert artifact.storage in ln.Storage.filter(space=space_test_move)\n    assert not original_path.exists()\n    assert artifact.path.as_posix().startswith(artifact.storage.root)\n    assert artifact.path.exists()\n\n    # move the directory artifact to another storage location\n    assert artifact_dir.space == space\n    assert artifact_dir.path.is_dir()\n    assert artifact_dir.storage in ln.Storage.filter(space=space)\n    original_path_dir = artifact_dir.path\n\n    artifact_dir.space = 
space_test_move\n    # save to the new storage location\n    with patch(\"builtins.input\", return_value=\"0\"):\n        artifact_dir.save()\n    assert artifact_dir.space == space_test_move\n    assert artifact_dir.storage in ln.Storage.filter(space=space_test_move)\n    original_path_dir.fs.invalidate_cache()\n    assert not original_path_dir.exists()\n    assert artifact_dir.path.as_posix().startswith(artifact_dir.storage.root)\n    assert artifact_dir.path.is_dir()\n\n    # update the space of the storage location\n    space2 = ln.Space.get(name=\"Our test space for CI 2\")\n    storage_loc.space = space2\n    storage_loc.save()\n\n    response_storage = select_storage(lnid=storage_loc.uid)\n    response_space = select_space(lnid=space2.uid)\n    assert response_storage[\"space_id\"] == response_space[\"id\"]\n\n    # connect to the instance before saving\n    subprocess.run(  # noqa: S602\n        \"lamin connect laminlabs/lamin-dev\",\n        shell=True,\n        check=True,\n    )\n    result = subprocess.run(  # noqa: S602\n        \"lamin save .gitignore --key mytest --space 'Our test space for CI 2'\",\n        shell=True,\n        capture_output=True,\n    )\n    assert \"key='mytest'\" in result.stdout.decode()\n    assert \"storage path:\" in result.stdout.decode()\n    assert result.returncode == 0\n\nfinally:\n    try:\n        storage_loc.run = None\n        storage_loc.save()\n    except:  # noqa\n        pass\n    cleanup(\n        (\n            ulabel,\n            artifact,\n            artifact_dir,\n            ln.context.transform.latest_run,\n            ln.context.transform,\n            storage_loc,\n        )\n    )\n"
  },
  {
    "path": "tests/permissions/scripts/setup_access.py",
    "content": "import lamindb as ln  # noqa\nimport hubmodule\nimport hubmodule.models as hm\nfrom uuid import uuid4\nfrom hubmodule.dev.migrate.deploy import _apply_migrations_with_tracking\nfrom hubmodule.dev.setup.install import (\n    _setup_extensions,\n    _setup_secret,\n    _setup_utils_db_modules,\n)\nfrom hubmodule.sql_generators._rls import RLSGenerator\nfrom hubmodule.sql_generators._dbwrite import install_dbwrite\nfrom laminhub_instancedb.postgres import DbRoleHandler\nfrom pathlib import Path\n\n\n# create a db connection url that works with RLS\ninstance_id = ln.setup.settings.instance._id\n\n\ndef create_jwt_user(dsn_admin: str, jwt_role_name: str):\n    db_role_handler = DbRoleHandler(dsn_admin)\n    jwt_db_url = db_role_handler.create(\n        jwt_role_name, expires_in=None, alter_if_exists=True\n    )\n    db_role_handler.permission.grant_write_jwt(jwt_role_name)\n    return jwt_db_url\n\n\npgurl = \"postgresql://postgres:pwd@0.0.0.0:5432/pgtest\"  # admin db connection url\njwt_role_name = f\"{instance_id.hex}_jwt\"\njwt_db_url = create_jwt_user(pgurl, jwt_role_name=jwt_role_name)\n\n_setup_extensions(pgurl)\n_setup_secret(pgurl)\n_setup_utils_db_modules(pgurl)\nmigrations_sql_dir = Path(hubmodule.__file__).parent / \"sql/0004_migrations\"\n_apply_migrations_with_tracking(pgurl, migrations_sql_dir)\n\nrls_generator = RLSGenerator(pgurl, jwt_role_name=jwt_role_name, public_role_name=None)\n\nfor i, table in enumerate(rls_generator._list_tables()):\n    print(i, table.table_name, table.foreign_keys, table.has_space_id)\n\nrls_generator.setup()\n\nprint(\"Created jwt db connection\")\n\ninstall_dbwrite(pgurl)\n\nprint(\"Installed dbwrite\")\n\n# create models\n\nfull_access = ln.Space(name=\"full access\").save()  # type: ignore\nselect_access = ln.Space(name=\"select access\").save()  # type: ignore\nno_access = ln.Space(name=\"no access\").save()  # type: ignore\n# set read role for the default space\nusettings = ln.setup.settings.user\naccount 
= hm.Account(id=usettings._uuid.hex, uid=usettings.uid, role=\"read\").save()\n\n# create a test user object\nln.User(uid=\"testuid1\", handle=\"testuser\", name=\"Test User\").save()\n\n# no access space\nulabel = ln.ULabel(name=\"no_access_ulabel\")\nulabel.space = no_access\nulabel.save()\n# set up access to this individual record with a dummy role,\n# will work only after the role is changed to read, write or admin\nhm.AccessRecord(\n    account=account, record_type=\"lamindb_ulabel\", record_id=ulabel.id, role=\"dummy\"\n).save()\n\nproject = ln.Project(name=\"No_access_project\")  # type: ignore\nproject.space = no_access\nproject.save()\n\nhm.AccessRecord(\n    account=account, record_type=\"lamindb_project\", record_id=project.id, role=\"dummy\"\n).save()\n\n# setup write access space\nhm.AccessSpace(account=account, space=full_access, role=\"write\").save()\n\nulabel = ln.ULabel(name=\"full_access_ulabel\")\nulabel.space = full_access\nulabel.save()\n# setup read access space\nhm.AccessSpace(account=account, space=select_access, role=\"read\").save()\n\nulabel = ln.ULabel(name=\"select_ulabel\")\nulabel.space = select_access\nulabel.save()\n# artificial but better to test\n# create a link table referencing rows in different spaces\nulabel.projects.add(project)\n\n# default space, only select access by default\nulabel = ln.ULabel(name=\"default_space_ulabel\").save()\nulabel.projects.add(project)\n\nproject = ln.Project(name=\"default_space_project\").save()\nulabel.projects.add(project)\n\n# create a link table referencing ulabel from the default space and project from select space\nproject = ln.Project(name=\"select_project\")\nproject.space = select_access\nproject.save()\n\nulabel.projects.add(project)\n\n# setup team and relevent models\nteam_access = ln.Space(name=\"team access\").save()  # type: ignore\nteam = hm.Team(id=uuid4().hex, uid=\"teamuiduid11\", name=\"test_team\", role=\"read\").save()\nhm.AccountTeam(account=account, 
team=team).save()\nhm.AccessSpace(team=team, space=team_access, role=\"read\").save()\n\nfeature = ln.Feature(name=\"team_access_feature\", dtype=float)\nfeature.space = team_access\nfeature.save()\n\n# artifact for testing tracking error and artifactblock\nartifact = ln.Artifact(\"README.md\", description=\"test tracking error\")\nartifact.space = select_access\nartifact.save()\n\n# artifact for testing tracking error and locking\nartifact = ln.Artifact(\".gitignore\", description=\"test locking\")\nartifact.space = full_access\nartifact.is_locked = True\nartifact.save()\n\n# create a single record in the default space\nrecord = ln.Record(name=\"test-record\", is_type=False).save()\nassert record.space_id == 1\n\nprint(\"Created models\")\n\n# save jwt db connection\n\nln.setup.settings.instance._db = jwt_db_url\nln.setup.settings.instance._persist()\n"
  },
  {
    "path": "tests/permissions/scripts/setup_instance.py",
    "content": "import lamindb_setup as ln_setup\nfrom laminci.db import setup_local_test_postgres\n\npgurl = setup_local_test_postgres()\n\nln_setup.init(\n    storage=\"./default_storage_permissions\",\n    name=\"lamindb-test-permissions\",\n    db=pgurl,\n)\n\n# can't add this app in the init because don't want t trigger the initial migration\n# that conflicts with _install_db_module\nln_setup.settings.instance._schema_str = \"hubmodule\"\nln_setup.settings.instance._persist()\n"
  },
  {
    "path": "tests/permissions/test_rls_dbwritelog.py",
    "content": "import subprocess\nimport time\nfrom pathlib import Path\nfrom uuid import uuid4\n\nimport hubmodule.models as hm\nimport lamindb as ln\nimport psycopg2\nimport pytest\nfrom django.db import connection, transaction\nfrom django.db.utils import IntegrityError, InternalError, ProgrammingError\nfrom hubmodule.sql_generators._dbwrite import uninstall_dbwrite\nfrom jwt_utils import sign_jwt\nfrom lamindb.models.artifact import track_run_input\nfrom lamindb_setup.core.django import DBToken, db_token_manager\nfrom psycopg2.extensions import adapt\n\npgurl = \"postgresql://postgres:pwd@0.0.0.0:5432/pgtest\"  # admin db connection url\n\nuser_uuid = ln.setup.settings.user._uuid.hex\nexpiration = time.time() + 2000\n# full collaborator token\ntoken = sign_jwt(\n    pgurl, {\"account_id\": user_uuid, \"exp\": expiration, \"type\": \"collaborator\"}\n)\n# read-only token\ntoken_read = sign_jwt(\n    pgurl, {\"account_id\": user_uuid, \"exp\": expiration, \"type\": \"read-only\"}\n)\n# init an instance of DBToken manually\ndb_token = DBToken({})\ndb_token._token = token\ndb_token._token_query = f\"SELECT set_token({adapt(token).getquoted().decode()}, true);\"\ndb_token._expiration = expiration\n\ndb_token_manager.set(db_token)\n\n\ndef test_token_expiration():\n    # init connection.connection\n    with connection.cursor() as cur:\n        pass\n\n    expired_token = sign_jwt(\n        pgurl,\n        {\"account_id\": user_uuid, \"exp\": time.time() - 1000, \"type\": \"collaborator\"},\n    )\n    # check that an expired token is invalid\n    with (\n        pytest.raises(psycopg2.errors.RaiseException),\n        connection.connection.cursor() as cur,\n    ):\n        cur.execute(\"SELECT set_token(%s);\", (expired_token,))\n\n\ndef test_authentication():\n    # just check that the token was setup\n    with connection.cursor() as cur:\n        cur.execute(\n            \"SELECT 1 in (SELECT id FROM public.check_access() WHERE role = 'read');\"\n        )\n       
 result = cur.fetchall()[0][0]\n    assert result\n    # check querying without setting jwt\n    with (\n        pytest.raises(psycopg2.errors.RaiseException),\n        connection.connection.cursor() as cur,\n    ):\n        cur.execute(\"SELECT * FROM lamindb_ulabel;\")\n    # test that auth can't be hijacked\n    # false table created before\n    with (\n        pytest.raises(psycopg2.errors.DuplicateTable),\n        connection.connection.cursor() as cur,\n    ):\n        cur.execute(\n            \"\"\"\n            CREATE TEMP TABLE access(\n                id int,\n                role varchar(20),\n                type text\n            ) ON COMMIT DROP;\n            SELECT set_token(%s);\n            \"\"\",\n            (token,),\n        )\n    # check that jwt user can't set arbitrary account_id manually\n    with (\n        pytest.raises(psycopg2.errors.RaiseException),\n        connection.connection.cursor() as cur,\n    ):\n        cur.execute(\n            \"\"\"\n            CREATE TEMP TABLE access(\n                id int,\n                role varchar(20),\n                type text\n            ) ON COMMIT DROP;\n            INSERT INTO access (id, role, type)\n            VALUES (1, 'admin', 'space');\n            SELECT * FROM check_access();\n            \"\"\"\n        )\n    # check manual insert\n    with (\n        pytest.raises(psycopg2.errors.InsufficientPrivilege),\n        connection.connection.cursor() as cur,\n    ):\n        cur.execute(\n            \"\"\"\n            SELECT set_token(%s);\n            INSERT INTO access (id, role, type)\n            VALUES (1, 'admin', 'space');\n            \"\"\",\n            (token,),\n        )\n    # test access to the security schema\n    with (\n        pytest.raises(psycopg2.errors.InsufficientPrivilege),\n        connection.connection.cursor() as cur,\n    ):\n        cur.execute(\"SELECT security.get_secret('jwt_secret');\")\n    # test read-only token\n    with 
connection.connection.cursor() as cur:\n        cur.execute(\"SELECT set_token(%s); SELECT * FROM check_access()\", (token_read,))\n        result = cur.fetchall()\n    assert len(result) == 1\n    assert result[0] == (1, \"read\", \"space\")\n\n    assert ln.base.users._user_has_write_access()\n\n\ndef test_select_without_db_token():\n    # with db token can be read in the default space\n    with connection.cursor() as cur:\n        cur.execute(\"SELECT * FROM lamindb_record;\")\n        results = cur.fetchall()\n    assert len(results) == 1\n    # the same\n    assert ln.Record.filter().count() == 1\n    # errors if can't select\n    ln.Record.get(1)\n    # no db token, everything in the default space\n    with (\n        pytest.raises(psycopg2.errors.RaiseException),\n        connection.connection.cursor() as cur,\n    ):\n        cur.execute(\"SELECT * FROM lamindb_record;\")\n    with (\n        pytest.raises(psycopg2.errors.RaiseException),\n        connection.connection.cursor() as cur,\n    ):\n        cur.execute(\"SELECT * FROM lamindb_record WHERE id = 1;\")\n    # no db token, in different spaces\n    with (\n        pytest.raises(psycopg2.errors.RaiseException),\n        connection.connection.cursor() as cur,\n    ):\n        cur.execute(\"SELECT * FROM lamindb_artifact;\")\n    with (\n        pytest.raises(psycopg2.errors.RaiseException),\n        connection.connection.cursor() as cur,\n    ):\n        cur.execute(\"SELECT * FROM lamindb_ulabel;\")\n    # no db token, utility tables\n    with (\n        pytest.raises(psycopg2.errors.RaiseException),\n        connection.connection.cursor() as cur,\n    ):\n        cur.execute(\"SELECT * FROM lamindb_user;\")\n    with (\n        pytest.raises(psycopg2.errors.RaiseException),\n        connection.connection.cursor() as cur,\n    ):\n        cur.execute(\"SELECT * FROM lamindb_space;\")\n\n\ndef test_fine_grained_permissions_account_and_dbwrite():\n    # check select\n    assert 
ln.ULabel.filter().count() == 3\n    assert ln.Project.filter().count() == 2\n\n    ulabel = ln.ULabel.get(name=\"default_space_ulabel\")\n    assert ulabel.projects.all().count() == 2\n    # check delete\n    # should delete\n    ulabel_del = ln.ULabel.get(name=\"full_access_ulabel\")\n    ulabel_del_id = ulabel_del.id\n    ulabel_del.delete(permanent=True)\n    assert ln.ULabel.filter().count() == 2\n    # check the logs for delete\n    log_rec = (\n        hm.DbWrite.filter(sqlrecord_id=ulabel_del_id, table_name=\"lamindb_ulabel\")\n        .order_by(\"-id\")\n        .first()\n    )\n    assert log_rec.event_type == \"DELETE\"\n    assert log_rec.data is not None\n    assert log_rec.created_by_id == 1\n    # check the logs for insert\n    log_rec = (\n        hm.DbWrite.filter(sqlrecord_id=ulabel_del_id, table_name=\"lamindb_ulabel\")\n        .order_by(\"id\")\n        .first()\n    )\n    assert log_rec.event_type == \"INSERT\"\n    assert log_rec.data is None\n    assert log_rec.created_by_id is None  # this was inserted without setting a db token\n    # should not delete, does not error for some reason\n    ln.ULabel.get(name=\"select_ulabel\").delete(permanent=True)\n    assert ln.ULabel.filter().count() == 2\n    # default space\n    ulabel.delete(permanent=True)\n    assert ln.ULabel.filter().count() == 2\n    # check insert\n    # should succeed\n    space = ln.Space.get(name=\"full access\")\n    ulabel = ln.ULabel(name=\"new label\")\n    ulabel.space = space\n    ulabel.save()\n    # should fail\n    with pytest.raises(ln.errors.NoWriteAccess):\n        ln.ULabel(name=\"new label fail\").save()\n    for space_name in [\"select access\", \"no access\"]:\n        space = ln.Space.get(name=space_name)\n        ulabel = ln.ULabel(name=\"new label fail\")\n        ulabel.space = space\n        with pytest.raises(ln.errors.NoWriteAccess):\n            ulabel.save()\n    # check update\n    # should succeed\n    ulabel = ln.ULabel.get(name=\"new label\")\n  
  ulabel.name = \"new label update\"\n    ulabel.save()\n    ulabel = ln.ULabel.get(name=\"new label update\")  # check that it is saved\n    # check the logs for update\n    log_rec = (\n        hm.DbWrite.filter(sqlrecord_id=ulabel.id, table_name=\"lamindb_ulabel\")\n        .order_by(\"-id\")\n        .first()\n    )\n    assert log_rec.event_type == \"UPDATE\"\n    assert log_rec.data[\"name\"] == \"new label\"  # changed\n    assert \"id\" not in log_rec.data  # didn't change\n    assert log_rec.created_by_id == 1\n    # should fail\n    ulabel = ln.ULabel.get(name=\"select_ulabel\")\n    ulabel.name = \"select_ulabel update\"\n    with pytest.raises(ln.errors.NoWriteAccess):\n        ulabel.save()\n    # default space\n    ulabel = ln.ULabel.get(name=\"default_space_ulabel\")\n    ulabel.name = \"default_space_ulabel update\"\n    with pytest.raises(ln.errors.NoWriteAccess):\n        ulabel.save()\n    # check link tables\n    # check insert\n    project = ln.Project(name=\"Myproject\")\n    project.space = ln.Space.get(name=\"full access\")\n    project.save()\n    ulabel = ln.ULabel.get(name=\"new label update\")\n    ulabel.projects.add(project)\n    assert ulabel.projects.all().count() == 1\n    # check select of a link table referencing unavailable rows\n    assert ln.ULabel.get(name=\"select_ulabel\").projects.all().count() == 0\n    # test SpaceBlock\n    space = ln.Space.get(name=\"select access\")\n    with pytest.raises(ln.errors.NoWriteAccess):\n        ln.models.SpaceBlock(space=space, content=\"test\", kind=\"readme\").save()\n    # test ArtifactBlock, artifact is read-only\n    artifact = ln.Artifact.get(description=\"test tracking error\")\n    with pytest.raises(ProgrammingError):\n        ln.models.ArtifactBlock(artifact=artifact, content=\"test\", kind=\"readme\").save()\n    # test BranchBlock, the account is read-only\n    branch = ln.Branch.get(1)  # main branch in all space\n    with pytest.raises(ProgrammingError):\n        
ln.models.BranchBlock(branch=branch, content=\"test\", kind=\"readme\").save()\n\n\ndef test_fine_grained_permissions_team():\n    assert ln.Feature.filter().count() == 1\n    ln.Feature.get(name=\"team_access_feature\")\n\n\ndef test_fine_grained_permissions_single_records():\n    assert not ln.ULabel.filter(name=\"no_access_ulabel\").exists()\n    assert not ln.Project.filter(name=\"No_access_project\").exists()\n\n    # check that the logs are not available for the ulabel\n    with psycopg2.connect(pgurl) as conn, conn.cursor() as cur:\n        cur.execute(\"SELECT id FROM lamindb_ulabel WHERE name = 'no_access_ulabel'\")\n        ulabel_id = cur.fetchone()[0]\n    assert not hm.DbWrite.filter(\n        sqlrecord_id=ulabel_id, table_name=\"lamindb_ulabel\"\n    ).exists()\n\n    # switch access to this ulabel to read\n    with psycopg2.connect(pgurl) as conn, conn.cursor() as cur:\n        cur.execute(\n            \"\"\"\n            UPDATE hubmodule_accessrecord SET role = 'read'\n            WHERE account_id = %s AND record_type = 'lamindb_ulabel'\n            \"\"\",\n            (user_uuid,),\n        )\n\n    ulabel = ln.ULabel.get(name=\"no_access_ulabel\")\n\n    # check that the logs are available now\n    assert hm.DbWrite.filter(\n        sqlrecord_id=ulabel.id, table_name=\"lamindb_ulabel\"\n    ).exists()\n\n    new_name = \"new_name_single_rls_access_ulabel\"\n    ulabel.name = new_name\n    with pytest.raises(ln.errors.NoWriteAccess):\n        ulabel.save()\n\n    # switch access for the project to read\n    with psycopg2.connect(pgurl) as conn, conn.cursor() as cur:\n        cur.execute(\n            \"\"\"\n            UPDATE hubmodule_accessrecord SET role = 'read'\n            WHERE account_id = %s AND record_type = 'lamindb_project'\n            \"\"\",\n            (user_uuid,),\n        )\n    # now the project is readable\n    project = ln.Project.get(name=\"No_access_project\")\n\n    # can't insert into lamindb_ulabelproject because the 
ulabel is read-only\n    with pytest.raises(ProgrammingError):\n        ulabel.projects.add(project)\n\n    # switch access for the ulabel to write\n    with psycopg2.connect(pgurl) as conn, conn.cursor() as cur:\n        cur.execute(\n            \"\"\"\n            UPDATE hubmodule_accessrecord SET role = 'write'\n            WHERE account_id = %s AND record_type = 'lamindb_ulabel'\n            \"\"\",\n            (user_uuid,),\n        )\n\n    ulabel.save()\n\n    # can insert into lamindb_ulabelproject because the ulabel is now write-able\n    # and the project is read-only, but this doesn't matter as the principal key is ulabel\n    ulabel.projects.add(project)\n    assert ulabel.projects.count() == 1\n\n    ulabel.delete(permanent=True)\n    assert not ln.ULabel.filter(name=\"no_access_ulabel\").exists()\n\n\n# tests that token is set properly in atomic blocks\ndef test_atomic():\n    with transaction.atomic():\n        assert ln.Feature.filter().count() == 1\n        # test with nested\n        with transaction.atomic():\n            assert ln.Feature.filter().count() == 1\n\n            feature = ln.Feature(name=\"atomic_feature\", dtype=float)\n            feature.space = ln.Space.get(name=\"full access\")\n            feature.save()\n\n    assert ln.Feature.filter().count() == 2\n\n\ndef test_utility_tables():\n    # can select in these tables\n    assert ln.Space.filter().count() == 5\n    # can't select\n    assert hm.Account.filter().count() == 0\n    assert hm.Team.filter().count() == 0\n    assert hm.AccountTeam.filter().count() == 0\n    assert hm.AccessSpace.filter().count() == 0\n    assert hm.AccessRecord.filter().count() == 0\n    # can't update a space\n    space = ln.Space.get(id=1)  # default space\n    space.name = \"new name\"\n    with pytest.raises(ProgrammingError):\n        space.save()\n    with pytest.raises(ProgrammingError):\n        ln.Space(name=\"new space\").save()\n    # can't insert\n    with 
pytest.raises(ProgrammingError):\n        hm.Account(id=uuid4().hex, uid=\"accntid2\", role=\"admin\").save()\n\n\ndef test_user_rls():\n    assert ln.User.filter().count() == 2\n    # should fail because can modify only the current user\n    user = ln.User.get(handle=\"testuser\")\n    user.name = \"New Name\"\n    with pytest.raises(ProgrammingError):\n        user.save()\n    # can't insert a user with a different uid\n    with pytest.raises(ProgrammingError):\n        ln.User(handle=\"insert_new_user\", uid=\"someuidd\").save()\n    # also triggers RLS\n    with pytest.raises(ProgrammingError):\n        ln.User(handle=\"insert_new_user\", uid=user.uid).save()\n    # try to insert a user with the same uid\n    # should not trigger RLS because the uid is the same, it should throw an IntegrityError\n    with pytest.raises(IntegrityError):\n        ln.User(handle=\"insert_new_user\", uid=ln.setup.settings.user.uid).save()\n    # can modify the current user\n    user = ln.User.get(1)\n    user.name = \"New Name\"\n    user.save()\n\n\ndef test_write_role():\n    # switch user role to write\n    with psycopg2.connect(pgurl) as conn, conn.cursor() as cur:\n        cur.execute(\n            \"UPDATE hubmodule_account SET role = 'write' WHERE id = %s\", (user_uuid,)\n        )\n\n    ln.ULabel(name=\"new label account default space\").save()\n\n    # switch user role back to read and team role to write\n    with psycopg2.connect(pgurl) as conn, conn.cursor() as cur:\n        cur.execute(\n            \"UPDATE hubmodule_account SET role = 'read' WHERE id = %s\", (user_uuid,)\n        )\n        cur.execute(\n            \"UPDATE hubmodule_team SET role = 'write' WHERE uid = 'teamuiduid11'\",\n        )\n\n    ln.ULabel(name=\"new label team default space\").save()\n\n    with psycopg2.connect(pgurl) as conn, conn.cursor() as cur:\n        cur.execute(\n            \"UPDATE hubmodule_team SET role = 'read' WHERE uid = 'teamuiduid11'\",\n        )\n\n\ndef 
test_locking():\n    artifact = ln.Artifact.get(description=\"test locking\")\n    artifact.description = \"new description\"\n    with pytest.raises(ln.errors.NoWriteAccess) as e:\n        artifact.save()\n    assert \"It is not allowed to modify or create locked\" in str(e)\n\n\ndef test_tracking_error():\n    # switch user role to write to create the transform and run\n    with psycopg2.connect(pgurl) as conn, conn.cursor() as cur:\n        cur.execute(\n            \"UPDATE hubmodule_account SET role = 'write' WHERE id = %s\", (user_uuid,)\n        )\n\n    artifact = ln.Artifact.get(description=\"test tracking error\")\n\n    transform = ln.Transform(key=\"My transform\").save()\n    run = ln.Run(transform).save()\n\n    # this error because ln.setup.settings.instance._db_permissions is not jwt\n    # it is None\n    with pytest.raises(ln.errors.NoWriteAccess) as e:\n        track_run_input(artifact, run)\n    assert \"You’re not allowed to write to the instance \" in str(e)\n\n    # the instance is local so we set this manually\n    ln.setup.settings.instance._db_permissions = \"jwt\"\n    # artifact.space is not available for writes\n    with pytest.raises(ln.errors.NoWriteAccess) as e:\n        track_run_input(artifact, run)\n    assert \"You’re not allowed to write to the space \" in str(e)\n\n    # this artifact is locked\n    artifact = ln.Artifact.get(description=\"test locking\")\n    with pytest.raises(ln.errors.NoWriteAccess) as e:\n        track_run_input(artifact, run)\n    assert \"It is not allowed to modify locked records\" in str(e)\n\n    # switch user role back to read\n    with psycopg2.connect(pgurl) as conn, conn.cursor() as cur:\n        cur.execute(\n            \"UPDATE hubmodule_account SET role = 'read' WHERE id = %s\", (user_uuid,)\n        )\n    # as the user is read-only now, 2 spaces are unavailable for writes (artifact.space, run.space)\n    artifact = ln.Artifact.get(description=\"test tracking error\")\n    with 
pytest.raises(ln.errors.NoWriteAccess) as e:\n        track_run_input(artifact, run)\n    assert \"You’re not allowed to write to the spaces \" in str(e)\n\n    ln.setup.settings.instance._db_permissions = None\n\n\ndef test_token_reset():\n    db_token_manager.reset()\n\n    # account_id is not set\n    with pytest.raises(InternalError) as error:\n        ln.ULabel.filter().count()\n    assert \"JWT is not set\" in error.exconly()\n\n    with pytest.raises(InternalError) as error, transaction.atomic():\n        ln.ULabel.filter().count()\n    assert \"JWT is not set\" in error.exconly()\n\n\ndef test_dbwrite_uninstall():\n    triggers_exist_query = (\n        \"SELECT EXISTS (SELECT 1 FROM pg_trigger WHERE tgname LIKE 'dbwrite_%')\"\n    )\n    table_exists_query = \"SELECT to_regclass('public.hubmodule_dbwrite') IS NOT NULL\"\n\n    with psycopg2.connect(pgurl) as conn, conn.cursor() as cur:\n        cur.execute(triggers_exist_query)\n        triggers_exist = cur.fetchone()[0]\n    assert triggers_exist\n\n    uninstall_dbwrite(pgurl, drop_table=False)\n\n    with psycopg2.connect(pgurl) as conn, conn.cursor() as cur:\n        cur.execute(triggers_exist_query)\n        triggers_exist = cur.fetchone()[0]\n        assert not triggers_exist\n\n        cur.execute(table_exists_query)\n        table_exists = cur.fetchone()[0]\n        assert table_exists\n\n    uninstall_dbwrite(pgurl, drop_table=True)\n\n    with psycopg2.connect(pgurl) as conn, conn.cursor() as cur:\n        cur.execute(table_exists_query)\n        table_exists = cur.fetchone()[0]\n    assert not table_exists\n\n\ndef test_lamin_dev():\n    script_path = Path(__file__).parent.resolve() / \"scripts/check_lamin_dev.py\"\n    subprocess.run(  # noqa: S602\n        f\"python {script_path}\",\n        shell=True,\n        check=True,\n    )\n"
  },
  {
    "path": "tests/profiling/import_lamindb.py",
    "content": "import lamindb as ln  # noqa: F401\n"
  },
  {
    "path": "tests/profiling/import_lamindb_and_connect.py",
    "content": "import lamindb as ln\n\n# should connect to an instance other than laminlabs/lamindata\n# because laminlabs/lamindata is used to log the test run\nln.connect(\"laminlabs/lamin-site-assets\")\n"
  },
  {
    "path": "tests/profiling/import_lamindb_core_storage.py",
    "content": "import lamindb.core.storage  # noqa: F401\n"
  },
  {
    "path": "tests/profiling/import_records_from_dataframe.py",
    "content": "import argparse\nfrom datetime import datetime\nfrom random import Random\nfrom time import perf_counter\n\nimport lamindb as ln\nimport pandas as pd\n\n\ndef generate_values(dtype: str, n_rows: int, rng: Random):\n    cell_types = [\n        \"T cell\",\n        \"B cell\",\n        \"natural killer cell\",\n        \"monocyte\",\n        \"epithelial cell\",\n    ]\n    if dtype in {\"float\", \"num\"}:\n        return [round(rng.uniform(0.0, 100.0), 3) for _ in range(n_rows)]\n    if dtype.startswith(\"cat[\"):\n        return [rng.choice(cell_types) for _ in range(n_rows)]\n    raise ValueError(f\"Unsupported dtype: {dtype}\")\n\n\n@ln.flow(\"JuJZZEsit1KV\")\ndef main(n_rows: int):\n    feature_names = [\n        \"age_or_mean_of_age_range\",\n        \"array_col\",\n        \"cell_type_by_model\",\n    ]\n    rng = Random(0)\n    features = ln.Feature.filter(name__in=feature_names)\n    dtypes_by_feature = {feature.name: feature.dtype_as_str for feature in features}\n\n    data: dict[str, list] = {}\n    print(\"Generating random dataframe values...\")\n    for feature in features:\n        data[feature.name] = generate_values(\n            dtypes_by_feature[feature.name], n_rows, rng\n        )\n    df = pd.DataFrame(data)\n    print(df.head(5))\n\n    print(\"Running Record.from_dataframe()...\")\n    from_dataframe_start = perf_counter()\n    records = ln.Record.from_dataframe(\n        df,\n        type=f\"test-import-records-from-dataframe-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}\",\n    )\n    from_dataframe_duration_sec = perf_counter() - from_dataframe_start\n    print(f\"... completed in {from_dataframe_duration_sec:.6f}s\")\n\n    print(\"Saving records...\")\n    save_start = perf_counter()\n    records.save()\n    save_duration_sec = perf_counter() - save_start\n    print(f\"... 
completed in {save_duration_sec:.6f}s\")\n\n    run = ln.context.run\n    params = run.params or {}\n    params.update(\n        {\n            \"from_dataframe_duration_sec\": round(from_dataframe_duration_sec, 6),\n            \"save_duration_sec\": round(save_duration_sec, 6),\n        }\n    )\n    run.params = params\n    run.save()\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser(\n        description=\"Prepare and optionally save test Records rows via Record.from_dataframe().\"\n    )\n    parser.add_argument(\"--rows\", type=int, default=100)\n    args = parser.parse_args()\n    ln.connect(\"laminlabs/lamindata\")\n    main(n_rows=args.rows)\n"
  },
  {
    "path": "tests/storage/conftest.py",
    "content": "import shutil\nfrom pathlib import Path\nfrom subprocess import DEVNULL, run\nfrom time import perf_counter\n\nimport lamindb as ln\nimport lamindb_setup as ln_setup\nimport pytest\nfrom lamin_utils import logger\nfrom laminci.db import setup_local_test_postgres\n\n\ndef create_test_instance(pgurl: str):\n    ln_setup.init(\n        storage=\"./default_storage_unit_storage\",\n        modules=\"bionty\",\n        name=\"lamindb-unit-tests-storage\",\n        db=pgurl,\n    )\n    ln_setup.register()  # temporarily\n\n    ln.settings.creation.artifact_silence_missing_run_warning = True\n    ln.settings.track_run_inputs = False\n    ln.Storage(\"s3://lamindb-ci/test-data\").save()\n    ln.Storage(\"s3://lamindb-test/core\").save()\n    ln.Storage(\"s3://lamindb-test/storage\").save()\n\n\ndef pytest_sessionstart():\n    t_execute_start = perf_counter()\n\n    ln_setup._TESTING = True\n    try:\n        pgurl = setup_local_test_postgres()\n    except RuntimeError:\n        run(\"docker stop pgtest && docker rm pgtest\", shell=True, stdout=DEVNULL)  # noqa: S602\n        pgurl = setup_local_test_postgres()\n    try:\n        create_test_instance(pgurl)\n    except Exception as e:\n        print(\"failed to create test instance:\", e)\n        print(\"deleting the instance\")\n        delete_test_instance()\n        # below currently fails because cannot create two instances in the same session\n        # create_test_instance(pgurl)\n        print(\"now rerun\")\n        quit()\n    total_time_elapsed = perf_counter() - t_execute_start\n    print(f\"time to setup the instance: {total_time_elapsed:.1f}s\")\n    assert ln.Storage.filter(root=\"s3://lamindb-ci/test-data\").one_or_none() is not None\n\n\ndef delete_test_instance():\n    logger.set_verbosity(1)\n    if Path(\"./default_storage_unit_storage\").exists():\n        shutil.rmtree(\"./default_storage_unit_storage\")\n    # handle below better in the future\n    for path in (\n        \"s3://lamindb-test/storage/.lamindb\",\n        \"s3://lamindb-test/core/.lamindb\",\n        \"s3://lamindb-ci/lamindb-unit-tests-cloud/.lamindb\",\n        \"s3://lamindb-ci/test-settings-switch-storage/.lamindb\",\n    ):\n        upath = ln_setup.core.upath.UPath(path)\n        if upath.exists():\n            upath.rmdir()\n    ln_setup.delete(\"lamindb-unit-tests-storage\", force=True)\n\n\ndef pytest_sessionfinish(session: pytest.Session):\n    delete_test_instance()\n    run(\"docker stop pgtest && docker rm pgtest\", shell=True, stdout=DEVNULL)  # noqa: S602\n\n\n@pytest.fixture\ndef ccaplog(caplog):\n    \"\"\"Attach caplog's handler to our custom logger for the duration of a test.\"\"\"\n    from lamin_utils._logger import logger\n\n    # Add caplog's handler to our custom logger\n    logger.addHandler(caplog.handler)\n\n    yield caplog\n\n    # Remove the handler again once the test has finished\n    logger.removeHandler(caplog.handler)\n"
  },
  {
    "path": "tests/storage/test_artifact_storage.py",
    "content": "import shutil\n\nimport anndata as ad\nimport lamindb as ln\nimport pytest\nfrom lamindb.errors import (\n    IntegrityError,\n)\n\n\ndef test_create_from_anndata_in_existing_cloud_storage():\n    filepath = \"s3://lamindb-test/core/scrnaseq_pbmc68k_tiny.h5ad\"\n    artifact = ln.Artifact.from_anndata(\n        filepath, description=\"test_create_from_anndata_cloudpath\"\n    )\n    assert artifact.n_observations == 70\n    artifact.save()\n    assert ln.Artifact.get(path=artifact.path) == artifact\n    # check that the local filepath has been cleared\n    assert not hasattr(artifact, \"_local_filepath\")\n    assert artifact.path.as_posix().startswith(\"s3://lamindb-test/core\")\n\n\n@pytest.mark.parametrize(\n    \"filepath_str\",\n    [\"s3://lamindb-ci/test-data/test.parquet\", \"s3://lamindb-ci/test-data/test.csv\"],\n)\n@pytest.mark.parametrize(\"skip_check_exists\", [False, True])\n@pytest.mark.parametrize(\"skip_size_and_hash\", [False, True])\ndef test_create_small_file_from_remote_path(\n    filepath_str, skip_check_exists, skip_size_and_hash\n):\n    ln.settings.creation.artifact_skip_size_hash = skip_size_and_hash\n    artifact = ln.Artifact(\n        filepath_str,\n        skip_check_exists=skip_check_exists,\n    )\n    artifact.save()\n    # test cache()\n    file_from_local = ln.Artifact(artifact.cache(), description=\"test\")\n    # test hash equivalency when computed on local machine\n    if not skip_size_and_hash:\n        assert file_from_local.hash == artifact.hash\n        assert file_from_local._hash_type == \"md5\"\n        assert artifact._hash_type == \"md5\"\n    assert artifact.path.as_posix() == filepath_str\n    assert artifact.load().iloc[0].tolist() == [\n        0,\n        \"Abingdon island giant tortoise\",\n        \"Chelonoidis abingdonii\",\n        106734,\n        \"ASM359739v1\",\n        \"GCA_003597395.1\",\n        \"Full genebuild\",\n        \"-\",\n        \"-\",\n    ]\n    
artifact.delete(permanent=True, storage=False)\n    ln.settings.creation.artifact_skip_size_hash = False\n\n\ndef test_versioning_arifact_from_existing_path(ccaplog):\n    artifact1 = ln.Artifact(\"s3://lamindb-ci/test-data/test.parquet\").save()\n    artifact2 = ln.Artifact(\n        \"s3://lamindb-ci/test-data/test.csv\", revises=artifact1\n    ).save()\n    assert \"you are saving to a non-latest version of the artifact\" not in ccaplog.text\n    assert artifact1.stem_uid == artifact2.stem_uid\n    assert artifact1.uid != artifact2.uid\n    artifact1.delete(permanent=True, storage=False)\n    artifact2.delete(permanent=True, storage=False)\n\n\ndef test_create_big_file_from_remote_path():\n    # the point of this test is check the multi-upload hash\n    filepath_str = \"s3://lamindb-test/core/human_immune.h5ad\"\n    # we don't use from_anndata() here because we test this with a small file for shorter run time\n    artifact = ln.Artifact(filepath_str)\n    assert not artifact._key_is_virtual\n    assert artifact._real_key is None\n    assert artifact.key == \"human_immune.h5ad\"\n    assert artifact._hash_type == \"md5-3\"\n    assert artifact.size == 21960324\n    assert artifact.path.as_posix() == filepath_str\n    # check _real_key\n    artifact = ln.Artifact(filepath_str, key=\"adata_test_key.h5ad\")\n    assert artifact._key_is_virtual\n    assert artifact.key == \"adata_test_key.h5ad\"\n    assert artifact._real_key.endswith(\"human_immune.h5ad\")\n    assert artifact.path.as_posix() == filepath_str\n\n\ndef test_delete_artifact_from_non_managed_storage():\n    artifact = ln.Artifact(\n        \"s3://lamindb-dev-datasets/file-to-test-for-delete.csv\",\n        description=\"My test file to delete from non-default storage\",\n    ).save()\n    assert artifact.storage.instance_uid != ln.setup.settings.instance.uid\n    assert artifact.key is not None\n    filepath = artifact.path\n    with pytest.raises(IntegrityError) as e:\n        artifact.delete()\n    
assert e.exconly().startswith(\n        \"lamindb.errors.IntegrityError: Cannot simply delete artifacts\"\n    )\n    artifact.delete(storage=False, permanent=True)\n    assert (\n        ln.Artifact.filter(\n            description=\"My test file to delete from non-default storage\",\n            branch_id=None,\n        ).first()\n        is None\n    )\n    assert filepath.exists()\n\n\ndef test_huggingface_paths():\n    artifact_adata = ln.Artifact(\n        \"hf://datasets/Koncopd/lamindb-test@main/anndata/pbmc68k_test.h5ad\",\n        description=\"hf adata\",\n    )\n    artifact_adata.save()\n    assert artifact_adata.key == \"anndata/pbmc68k_test.h5ad\"\n    assert artifact_adata.hash is not None\n    assert isinstance(artifact_adata.load(), ad.AnnData)\n    assert artifact_adata._cache_path.exists()\n    artifact_adata._cache_path.unlink()\n\n    artifact_pq = ln.Artifact(\n        \"hf://datasets/Koncopd/lamindb-test/sharded_parquet\", description=\"hf parquet\"\n    )\n    artifact_pq.save()\n    assert artifact_pq.hash is not None\n    assert len(artifact_pq.open().files) == 11\n    assert artifact_pq.cache().is_dir()\n    shutil.rmtree(artifact_pq._cache_path)\n\n    artifact_adata.delete(permanent=True, storage=False)\n    artifact_pq.delete(permanent=True, storage=False)\n\n\ndef test_gcp_paths():\n    artifact_folder = ln.Artifact(\n        \"gs://rxrx1-europe-west4/images/test/HEPG2-08\", description=\"Test GCP folder\"\n    ).save()\n    assert artifact_folder.hash == \"6r5Hkce0UTy7X6gLeaqzBA\"\n    assert artifact_folder.n_files == 14772\n\n    artifact_file = ln.Artifact(\n        \"gs://rxrx1-europe-west4/images/test/HEPG2-08/Plate1/B02_s1_w1.png\",\n        description=\"Test GCP file\",\n    ).save()\n    assert artifact_file.hash == \"foEgLjmuUHO62CazxN97rA\"\n    cache_path = artifact_file.cache()\n    assert cache_path.is_file()\n\n    cache_path.unlink()\n    artifact_folder.delete(permanent=True, storage=False)\n    
artifact_file.delete(permanent=True, storage=False)\n\n\ndef test_http_paths():\n    http_path = ln.UPath(\n        \"https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/README.md\"\n    )\n    artifact_readme = ln.Artifact(http_path, description=\"register http readme\").save()\n    # might change\n    assert artifact_readme.hash is not None\n    cache_path = artifact_readme.cache()\n    assert cache_path.exists()\n    assert cache_path.stat().st_size == http_path.stat().st_size\n    cache_path.unlink()\n    # just check saving for the second time (when the Storage record is already in the db)\n    artifact_license = ln.Artifact(\n        \"https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/LICENSE\",\n        description=\"register http license\",\n    ).save()\n    assert artifact_license.hash == \"IQxRSNjvb7w2OLFeWqYlsg\"\n\n    artifact_readme.delete(permanent=True, storage=False)\n    artifact_license.delete(permanent=True, storage=False)\n\n\n# also see test in lamindb-setup/tests/storage/test_storage_stats.py\n# there is also a test for GCP there\ndef test_folder_like_artifact_s3():\n    study0_data = ln.Artifact(\"s3://lamindata/iris_studies/study0_raw_images\")\n    assert study0_data.hash == \"IVKGMfNwi8zKvnpaD_gG7w\"\n    assert study0_data._hash_type == \"md5-d\"\n    assert study0_data.n_files == 51\n    assert study0_data.size == 658465\n\n\ndef test_single_file_directory_preserved(tmp_path):\n    local_dir = tmp_path / \"single_file_dir\"\n    local_dir.mkdir()\n    (local_dir / \"only.txt\").write_text(\"single file\")\n\n    storage = ln.Storage.get(root=\"s3://lamindb-test/storage\")\n    artifact = ln.Artifact(\n        local_dir, key=\"tests/single-file-directory\", storage=storage\n    ).save()\n    assert artifact.path.as_posix().startswith(\"s3://lamindb-test/storage\")\n    assert artifact.n_files == 1\n    assert artifact.path.is_dir()\n    assert [file.name for file in artifact.path.iterdir()] == [\"only.txt\"]\n\n    artifact.delete(permanent=True)\n"
  },
  {
    "path": "tests/storage/test_artifact_zarr.py",
    "content": "import shutil\nfrom pathlib import Path\n\nimport anndata as ad\nimport lamindb as ln\nimport numpy as np\nimport pandas as pd\nimport pytest\nfrom lamindb.core.storage._zarr import identify_zarr_type\nfrom lamindb_setup.core.upath import (\n    CloudPath,\n)\n\n\n@pytest.fixture(scope=\"session\")\ndef get_small_adata():\n    return ad.AnnData(\n        X=np.array([[1, 2, 3], [4, 5, 6]]),\n        obs={\"feat1\": [\"A\", \"B\"]},\n        var=pd.DataFrame(index=[\"MYC\", \"TCF7\", \"GATA1\"]),\n        obsm={\"X_pca\": np.array([[1, 2], [3, 4]])},\n    )\n\n\ndef test_zarr_upload_cache(get_small_adata):\n    previous_storage = ln.setup.settings.storage.root_as_str\n    ln.settings.storage = \"s3://lamindb-test/core\"\n\n    zarr_path = Path(\"./test_adata.zarr\")\n    get_small_adata.write_zarr(zarr_path)\n\n    artifact = ln.Artifact(zarr_path, key=\"test_adata.zarr\")\n    assert not artifact._storage_ongoing\n    assert artifact.otype == \"AnnData\"\n    assert artifact.n_files >= 1\n    artifact.save()\n\n    assert ln.Artifact.get(path=artifact.path) == artifact\n\n    assert not artifact._storage_ongoing\n\n    assert isinstance(artifact.path, CloudPath)\n    assert artifact.path.exists()\n    assert identify_zarr_type(artifact.path) == \"anndata\"\n\n    shutil.rmtree(artifact.cache())\n\n    cache_path = artifact._cache_path\n    assert isinstance(artifact.load(), ad.AnnData)\n    assert cache_path.is_dir()\n\n    shutil.rmtree(cache_path)\n    assert not cache_path.exists()\n    artifact.cache()\n    assert cache_path.is_dir()\n\n    artifact.delete(permanent=True, storage=True)\n    shutil.rmtree(zarr_path)\n\n    # test zarr from memory\n    artifact = ln.Artifact(get_small_adata, key=\"test_adata.anndata.zarr\")\n    assert not artifact._storage_ongoing\n    assert artifact._local_filepath.is_dir()\n    assert artifact.otype == \"AnnData\"\n    assert artifact.suffix == \".anndata.zarr\"\n    assert artifact.n_files >= 1\n\n    
ln.save([artifact])  # use bulk save here for testing\n    assert not artifact._storage_ongoing\n    assert isinstance(artifact.path, CloudPath)\n    assert artifact.path.exists()\n    cache_path = artifact._cache_path\n    assert cache_path.is_dir()\n\n    shutil.rmtree(cache_path)\n    assert not cache_path.exists()\n\n    artifact._memory_rep = None\n\n    assert isinstance(artifact.load(), ad.AnnData)\n    assert cache_path.is_dir()\n\n    artifact.delete(permanent=True, storage=True)\n\n    ln.settings.storage = previous_storage\n"
  },
  {
    "path": "tests/storage/test_cache.py",
    "content": "import shutil\nfrom pathlib import Path\nfrom time import sleep\n\nimport lamindb as ln\nimport pytest\nfrom lamindb.core.loaders import load_h5ad\nfrom lamindb_setup._set_managed_storage import set_managed_storage\n\n\n# https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test\n# switch to cloud storage and back\n@pytest.fixture\ndef switch_storage():\n    cloud_storage = \"s3://lamindb-ci/lamindb-unit-tests-cloud\"\n    set_managed_storage(cloud_storage)\n    yield cloud_storage\n    set_managed_storage(\"./default_storage_unit_storage\")\n\n\ndef test_local_cache():\n    # check that we have local storage\n    local_storage = Path(\"./default_storage_unit_storage\").resolve().as_posix()\n    assert ln.setup.settings.storage.root_as_str == local_storage\n\n    test_file = ln.examples.datasets.anndata_file_pbmc68k_test()\n    adata = load_h5ad(test_file)\n\n    artifact = ln.Artifact.from_anndata(adata, key=\"test_cache.h5ad\")\n    temp_path = artifact._local_filepath.resolve()\n    assert temp_path.exists()\n    assert ln.setup.settings.cache_dir in temp_path.parents\n\n    artifact.save()\n    assert artifact.path.exists()\n    assert not temp_path.exists()\n\n    artifact.delete(permanent=True)\n\n    # check directories\n    adata_zarr_pth = Path(\"test_adata.zarr\")\n    adata.write_zarr(adata_zarr_pth)\n    assert adata_zarr_pth.exists()\n\n    artifact = ln.Artifact(adata_zarr_pth, key=\"test_cache.zarr\").save()\n    assert adata_zarr_pth.exists()\n    assert artifact.path.exists()\n    assert artifact.path.name != artifact.key\n\n    shutil.rmtree(adata_zarr_pth)\n    artifact.delete(permanent=True)\n\n    # check directories in cache\n    cache_dir = ln.setup.settings.cache_dir\n    adata_zarr_pth = cache_dir / \"test_adata.zarr\"\n    adata.write_zarr(adata_zarr_pth)\n\n    artifact = ln.Artifact(adata_zarr_pth, key=\"test_cache.zarr\")\n    assert adata_zarr_pth.exists()\n    artifact.save()\n\n    
assert not adata_zarr_pth.exists()\n    assert artifact.path.exists()\n    assert artifact.path.name != artifact.key\n\n    artifact.delete(permanent=True)\n\n\ndef test_cloud_cache(switch_storage):\n    # check that we have cloud storage\n    assert ln.setup.settings.storage.root_as_str == switch_storage\n\n    cache_dir = ln.setup.settings.cache_dir\n    assert cache_dir is not None\n\n    test_file = ln.examples.datasets.anndata_file_pbmc68k_test()\n\n    # test cache for saving an in-memory object\n    adata = load_h5ad(test_file)\n\n    artifact = ln.Artifact.from_anndata(adata, key=\"test_cache.h5ad\")\n    temp_path = artifact._local_filepath.resolve()\n    assert cache_dir in temp_path.parents\n    artifact.save()\n    assert not temp_path.exists()\n    cloud_path = artifact.path\n    cache_path = artifact._cache_path\n    assert cache_path.exists()\n    assert (\n        cache_path == cache_dir / \"lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad\"\n    )\n    assert cloud_path.modified.timestamp() < cache_path.stat().st_mtime\n\n    artifact.delete(permanent=True)\n\n    # test cache for saving an on-disk object\n    artifact = ln.Artifact.from_anndata(test_file, key=\"test_cache.h5ad\")\n    artifact.save()\n    cloud_path = artifact.path\n    cache_path = artifact._cache_path\n    assert cache_path.exists()\n    assert (\n        cache_path == cache_dir / \"lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad\"\n    )\n    assert test_file.stat().st_mtime < cache_path.stat().st_mtime\n    assert cloud_path.modified.timestamp() < cache_path.stat().st_mtime\n\n    artifact.delete(permanent=True)\n\n    # test cache for a directory on-disk object outside the cache dir\n    adata_zarr_pth = Path(\"test_adata.zarr\")\n    adata.write_zarr(adata_zarr_pth)\n    artifact = ln.Artifact(adata_zarr_pth, key=\"test_cache.zarr\")\n    artifact.save()\n    assert adata_zarr_pth.is_dir()\n    cache_path = artifact._cache_path\n    assert cache_path.is_dir()\n    
assert (\n        cache_path == cache_dir / \"lamindb-ci/lamindb-unit-tests-cloud/test_cache.zarr\"\n    )\n\n    shutil.rmtree(adata_zarr_pth)\n    artifact.delete(permanent=True)\n\n    # inside the cache dir\n    adata_zarr_pth = cache_dir / \"test_adata.zarr\"\n    adata.write_zarr(adata_zarr_pth)\n    artifact = ln.Artifact(adata_zarr_pth, key=\"test_cache.zarr\")\n    assert adata_zarr_pth.exists()\n    artifact.save()\n    assert not adata_zarr_pth.exists()\n    cache_path = artifact._cache_path\n    assert cache_path.is_dir()\n    assert (\n        cache_path == cache_dir / \"lamindb-ci/lamindb-unit-tests-cloud/test_cache.zarr\"\n    )\n\n    artifact.delete(permanent=True)\n\n\ndef test_cloud_cache_versions(switch_storage):\n    adata = load_h5ad(ln.examples.datasets.anndata_file_pbmc68k_test())\n\n    cache_dir = ln.setup.settings.cache_dir\n    assert cache_dir is not None\n\n    artifact = ln.Artifact.from_anndata(adata, key=\"test_cache.h5ad\")\n    assert ln.settings.cache_dir in artifact._local_filepath.parents\n    artifact.save()\n    cache_path_v1 = artifact.cache()\n    assert cache_path_v1.exists()\n    assert (\n        cache_path_v1\n        == cache_dir / \"lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad\"\n    )\n    cache_path_v1.unlink()\n    artifact.cache(print_progress=False)\n    assert cache_path_v1.exists()\n    assert (\n        cache_path_v1\n        == cache_dir / \"lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad\"\n    )\n    timestamp_v1 = cache_path_v1.stat().st_mtime\n    # hope it is enough to avoid random timestamp problems further\n    sleep(1)\n    # new version\n    adata.obs[\"test_cache\"] = \"test\"\n    artifact_v2 = ln.Artifact.from_anndata(\n        adata, key=\"test_cache.h5ad\", revises=artifact\n    )\n    assert ln.settings.cache_dir in artifact_v2._local_filepath.parents\n    artifact_v2.save()\n    assert artifact_v2.is_latest\n    assert not artifact.is_latest\n    cache_path_v2 = 
artifact_v2.cache()\n    assert cache_path_v2.exists()\n    assert (\n        cache_path_v2\n        == cache_dir / \"lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad\"\n    )\n    assert cache_path_v2.stat().st_mtime > timestamp_v1\n    cache_path_v2.unlink()\n    artifact_v2.cache(mute=True)\n    assert cache_path_v2.exists()\n    assert (\n        cache_path_v2\n        == cache_dir / \"lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad\"\n    )\n    assert \"test_cache\" in load_h5ad(cache_path_v2).obs.columns\n    cache_mtime = cache_path_v2.stat().st_mtime\n    assert cache_mtime == artifact_v2.path.modified.timestamp()\n    assert cache_mtime > timestamp_v1\n    # old version cache ignores key\n    cache_path_v1 = artifact.cache()\n    assert cache_path_v1.exists()\n    assert cache_path_v1.name == f\"{artifact.uid}.h5ad\"\n\n    artifact_v2.versions.delete(permanent=True)\n\n\ndef test_corrupted_cache_local():\n    filepath = ln.examples.datasets.anndata_file_pbmc68k_test()\n    artifact = ln.Artifact.from_anndata(filepath, key=\"test_corrupt_cache_local.h5ad\")\n    artifact.save()\n    # corrupt cache\n    with open(artifact._cache_path, \"r+b\") as f:\n        f.write(b\"corruption\")\n    # just raises an exception, nothing to re-sync on local\n    with pytest.raises(OSError):\n        artifact.load()\n    with pytest.raises(OSError):\n        artifact.open()\n\n    artifact.delete(permanent=True)\n\n\ndef test_corrupted_cache_cloud(switch_storage):\n    # check that we have cloud storage\n    assert ln.setup.settings.storage.root_as_str == switch_storage\n\n    filepath = ln.examples.datasets.anndata_file_pbmc68k_test()\n    artifact = ln.Artifact.from_anndata(filepath, key=\"test_corrupt_cache_cloud.h5ad\")\n    artifact.save()\n    # corrupt cache\n    # sleep not to reset cache mtime to a smaller value\n    # it is increased artificially on cache copying in save\n    # so due to lower granularity of cloud mtimes and fast code execution\n    # 
after the change cache mtime can become smaller than cloud mtime\n    sleep(1)\n    with open(artifact._cache_path, \"r+b\") as f:\n        f.write(b\"corruption\")\n    assert artifact._cache_path.stat().st_mtime > artifact.path.stat().st_mtime\n    # check that it is indeed corrupted\n    with pytest.raises(OSError):\n        load_h5ad(artifact.cache())\n    # should load successfully\n    artifact.load()\n    # check open also\n    assert artifact._cache_path.exists()\n    with open(artifact._cache_path, \"r+b\") as f:\n        f.write(b\"corruption\")\n    # should open successfully\n    with artifact.open():\n        pass\n    # corrupted cache has been deleted\n    assert not artifact._cache_path.exists()\n\n    artifact.delete(permanent=True)\n"
  },
  {
    "path": "tests/storage/test_connect_reconnect.py",
    "content": "import lamindb as ln\nimport pytest\n\n\ndef test_connect_reconnect():\n    # testuser2 needs write access to lamin-site-assets because of a fluke\n    # in the legacy collaborator management, it seems\n    assert ln.setup.settings.user.handle == \"testuser2\"\n    ln.connect(\"lamindb-unit-tests-storage\")  # this is not changing anything\n    count1 = ln.Artifact.filter().count()\n    # a public instance that does not have bionty configured\n    ln.connect(\"laminlabs/lamin-site-assets\")\n    count2 = ln.Artifact.filter().count()\n    assert count1 != count2\n    with pytest.raises(ln.setup.errors.ModuleWasntConfigured):\n        import bionty as bt\n    ln.connect(\"lamindb-unit-tests-storage\")\n    import bionty as bt\n\n    count3 = bt.Gene.filter().count()\n    assert count2 != count3\n"
  },
  {
    "path": "tests/storage/test_storage_lifecycle.py",
    "content": "from pathlib import Path\n\nimport lamindb as ln\nimport pytest\nfrom lamindb_setup.core._hub_core import get_storage_records_for_instance\n\n\ndef check_storage_location_on_hub_exists(uid: str):\n    all_storage_records = get_storage_records_for_instance(\n        ln.setup.settings.instance._id\n    )\n    length = len([r for r in all_storage_records if r[\"lnid\"] == uid])\n    if length not in {0, 1}:\n        raise AssertionError(\n            f\"Expected 0 or 1 storage records for uid {uid}, found {length}.\"\n        )\n    return length == 1\n\n\ndef test_reference_storage_location(ccaplog):\n    ln.Artifact(\"s3://lamindata/iris_studies/study0_raw_images\")\n    assert ln.Storage.get(root=\"s3://lamindata\").instance_uid == \"4XIuR0tvaiXM\"\n    # assert (\n    #     \"referenced read-only storage location at s3://lamindata, is managed by instance with uid 4XIuR0tvaiXM\"\n    #     in ccaplog.text\n    # )\n\n\ndef test_switch_delete_storage_location():\n    ln.settings.storage = \"./default_storage_unit_storage\"\n    assert (\n        ln.settings.storage.root.resolve()\n        == Path(\"./default_storage_unit_storage\").resolve()\n    )\n    new_storage_location = \"s3://lamindb-ci/test-settings-switch-storage\"\n    ln.Storage(new_storage_location).save()\n    ln.settings.storage = new_storage_location\n    assert ln.setup.settings.storage.type_is_cloud\n    assert ln.setup.settings.storage.root_as_str == new_storage_location\n    # root.fs contains the underlying fsspec filesystem\n    # the following is set by lamindb to True for s3 by default\n    assert ln.setup.settings.storage.root.fs.cache_regions\n    ln.settings.storage = new_storage_location, {\"cache_regions\": False}\n    assert not ln.setup.settings.storage.root.fs.cache_regions\n    assert ln.setup.settings.storage.root.exists()\n\n    # now work with the new storage location\n    new_storage = ln.Storage.get(root=new_storage_location)\n    assert 
check_storage_location_on_hub_exists(new_storage.uid)\n    artifact = ln.Artifact(\".gitignore\", key=\"test_artifact\").save()\n    assert new_storage.root in artifact.path.as_posix()\n\n    # artifacts exist\n    with pytest.raises(AssertionError) as err:\n        new_storage.delete()\n    assert \"Cannot delete storage with artifacts in current instance.\" in err.exconly()\n\n    artifact.delete(permanent=True, storage=False)\n    # still some files in there\n    with pytest.raises(ln.setup.errors.StorageNotEmpty) as err:\n        new_storage.delete()\n    assert (\n        \"'s3://lamindb-ci/test-settings-switch-storage/.lamindb' contains 1 objects\"\n        in err.exconly()\n    )\n\n    # now delete the artifact so that the storage location is empty\n    artifact.path.unlink()\n    with pytest.raises(AssertionError) as err:\n        new_storage.delete()\n    assert (\n        \"Cannot delete the current storage location, switch to another.\"\n        in err.exconly()\n    )\n\n    # check all attempts unsuccessful so far\n    assert check_storage_location_on_hub_exists(new_storage.uid)\n\n    # switch back to default storage\n    ln.settings.storage = \"./default_storage_unit_storage\"\n    storage_marker = ln.UPath(new_storage_location) / \".lamindb/storage_uid.txt\"\n    assert storage_marker.exists()\n    new_storage.delete()\n    assert not check_storage_location_on_hub_exists(new_storage.uid)\n    assert not storage_marker.exists()\n"
  },
  {
    "path": "tests/storage/test_streaming.py",
    "content": "import gzip\nimport shutil\nfrom pathlib import Path\n\nimport anndata as ad\nimport h5py\nimport lamindb as ln\nimport numpy as np\nimport pandas as pd\nimport pytest\nimport zarr\nfrom lamindb.core.loaders import load_h5ad\nfrom lamindb.core.storage._anndata_accessor import _anndata_n_observations, _to_index\nfrom lamindb.core.storage._backed_access import (\n    _flat_suffixes,\n    backed_access,\n)\nfrom lamindb.core.storage._polars_lazy_df import _open_polars_lazy_df, _polars_options\nfrom lamindb.core.storage._pyarrow_dataset import _open_pyarrow_dataset\nfrom lamindb.core.storage._zarr import load_zarr\nfrom lamindb.core.storage.objects import infer_suffix, write_to_disk\n\n\n@pytest.fixture\ndef bad_adata_path():\n    fp = ln.examples.datasets.anndata_file_pbmc68k_test()\n    adata = load_h5ad(fp)\n    to = fp.with_name(\"pbmc68k_bad.h5ad\")\n    shutil.copy(fp, to)\n    fp = to\n    file = h5py.File(fp, mode=\"r+\")\n    for field_name in (\"obs\", \"var\"):\n        field = getattr(adata, field_name).to_records()\n        formats = []\n        for name, (dt, _) in field.dtype.fields.items():\n            if dt == \"O\":\n                new_dt = str(field[name].astype(str).dtype).replace(\"<U\", \"S\")\n            else:\n                new_dt = dt\n            formats.append((name, new_dt))\n        del file[field_name]\n        file.create_dataset(field_name, data=field.astype(formats))\n    del file[\"X\"].attrs[\"encoding-type\"]\n    del file[\"X\"].attrs[\"encoding-version\"]\n    del file[\"obsp\"][\"test\"].attrs[\"encoding-type\"]\n    del file[\"obsp\"][\"test\"].attrs[\"encoding-version\"]\n    file.close()\n    return fp\n\n\ndef test_anndata_io():\n    test_file = ln.examples.datasets.anndata_file_pbmc68k_test()\n\n    adata = load_h5ad(test_file)\n\n    zarr_path = test_file.with_suffix(\".zarr\")\n    adata.write_zarr(zarr_path)\n\n    adata = load_zarr(zarr_path, \"anndata\")\n\n    assert adata.shape == (30, 200)\n\n    
shutil.rmtree(zarr_path)\n\n\n@pytest.mark.parametrize(\"adata_format\", [\"h5ad\", \"zarr\"])\ndef test_backed_access(adata_format):\n    fp = ln.UPath(ln.examples.datasets.anndata_file_pbmc68k_test())\n    if adata_format == \"zarr\":\n        adata = load_h5ad(fp)\n\n        fp = fp.with_suffix(\".zarr\")\n        adata.write_zarr(fp)\n        del adata\n        # remove encoding information to check correctness of backed accessor\n        store = zarr.open(fp)\n        del store[\"obsp\"][\"test\"].attrs[\"encoding-type\"]\n        del store[\"obsp\"][\"test\"].attrs[\"encoding-version\"]\n        del store[\"obsm\"][\"X_pca\"].attrs[\"encoding-type\"]\n        del store[\"obsm\"][\"X_pca\"].attrs[\"encoding-version\"]\n        del store\n\n    with pytest.raises(ValueError):\n        access = backed_access(fp.with_suffix(\".invalid_suffix\"), using_key=None)\n\n    # can't open anndata in write mode\n    with pytest.raises(ValueError):\n        access = backed_access(fp, mode=\"a\", using_key=None)\n\n    access = backed_access(fp, using_key=None)\n    assert not access.closed\n\n    assert isinstance(access.obs_names, pd.Index)\n    assert isinstance(access.var_names, pd.Index)\n    assert access.raw.shape == (30, 100)\n    assert access.obsp[\"test\"].to_memory().sum() == 30\n    assert access.varp[\"test\"].to_memory().sum() == 200\n    assert access.layers[\"test\"][0].sum() == 200\n\n    mask = np.full(access.shape[0], False, dtype=bool)\n    mask[:5] = True\n    assert access[mask].X.shape == (5, 200)\n\n    sub = access[:10]\n    assert sub[:5].shape == (5, 200)\n    assert sub.layers[\"test\"].shape == sub.shape\n    assert sub.raw.shape == (10, 100)\n    assert sub.obsp[\"test\"].sum() == 10\n    assert sub.varp[\"test\"].sum() == 200\n    assert sub.obsm[\"X_pca\"].shape == (10, 50)\n\n    with pytest.raises(AttributeError):\n        sub.raw.raw  # noqa: B018\n\n    assert access[:, [1, 2, 5]].varp[\"test\"].sum() == 3\n\n    obs_sub = 
[\"TCAATCACCCTTCG-8\", \"CGTTATACAGTACC-8\", \"TGCCAAGATTGTGG-7\"]\n    sub = access[obs_sub]\n    assert sub.obs_names.tolist() == obs_sub\n    assert sub.to_memory().shape == (3, 200)\n\n    # check with a bool mask\n    obs_mask = np.isin(access.obs_names, obs_sub)\n    sub = access[obs_mask]\n    assert sub.obs_names.tolist() == obs_sub\n    assert sub.to_memory().shape == (3, 200)\n\n    idx = np.array([1, 2, 5])\n    sub = access[idx]\n    assert sub.raw.shape == (3, 100)\n    assert sub.to_memory().shape == (3, 200)\n\n    var_sub = [\"SSU72\", \"PARK7\", \"RBP7\"]\n    sub = access[:, var_sub]\n    assert sub.var_names.tolist() == var_sub\n\n    assert access.to_memory().shape == (30, 200)\n    assert sub.to_memory().shape == (30, 3)\n\n    access.close()\n    assert access.closed\n    del access\n\n    with backed_access(fp, using_key=None) as access:\n        assert not access.closed\n        sub = access[:10]\n        assert sub[:5].shape == (5, 200)\n        assert sub.layers[\"test\"].shape == sub.shape\n    assert access.closed\n\n    with backed_access(fp, using_key=None) as access:\n        idx = np.array([3, 1, 2])\n        assert access[:, idx].to_memory().shape == (30, 3)\n        assert access[idx].to_memory().shape == (3, 200)\n\n    if adata_format == \"zarr\":\n        assert fp.suffix == \".zarr\"\n        shutil.rmtree(fp)\n\n\ndef test_add_column():\n    previous_storage = ln.setup.settings.storage.root_as_str\n    ln.settings.storage = \"s3://lamindb-test/storage\"\n\n    adata = load_h5ad(ln.examples.datasets.anndata_file_pbmc68k_test())\n    zarr_path = \"adata_write_mode.zarr\"\n    adata.write_zarr(zarr_path)\n\n    artifact = ln.Artifact(zarr_path, description=\"test add_column\").save()\n\n    access = artifact.open(mode=\"r+\")\n    n_obs, n_var = access.shape\n    access.add_column(\"obs\", \"ones_obs\", np.ones(n_obs))\n    access.add_column(\"var\", \"ones_var\", np.ones(n_var))\n    assert np.all(access.obs[\"ones_obs\"] == 
1)\n    assert np.all(access.var[\"ones_var\"] == 1)\n    access.close()\n    assert artifact.uid.endswith(\"0001\")\n\n    cat_col = pd.Categorical([\"one\"] + [\"two\"] * (n_obs - 1))\n    with artifact.open(mode=\"r+\") as access:\n        access.add_column(\"obs\", \"cat_col\", cat_col)\n        assert access.obs[\"cat_col\"].cat.categories.to_list() == [\"one\", \"two\"]\n    assert artifact.uid.endswith(\"0002\")\n    # can't add in read mode\n    with pytest.raises(ValueError):\n        artifact.open().add_column(\"obs\", \"new_col\", cat_col)\n\n    artifact.delete(permanent=True)\n    shutil.rmtree(zarr_path)\n\n    ln.settings.storage = previous_storage\n\n\ndef test_to_index():\n    elem_int = np.arange(3, dtype=int)\n    elem_float = elem_int.astype(float)\n    elem_str = elem_int.astype(str)\n\n    assert _to_index(elem_int).dtype == \"object\"\n    assert _to_index(elem_float).dtype == \"object\"\n    assert _to_index(elem_str).dtype == \"object\"\n\n\ndef test_infer_suffix():\n    adata = ad.AnnData()\n    assert infer_suffix(adata, format=\"h5ad\") == \".h5ad\"\n    with pytest.raises(ValueError):\n        infer_suffix(adata, format=\"my format\")\n    with pytest.raises(NotImplementedError):\n        infer_suffix(ln.Artifact)\n\n\ndef test_write_to_disk():\n    with pytest.raises(NotImplementedError):\n        write_to_disk(ln.Artifact, \"path\")\n\n    df = pd.DataFrame({\"x\": [1, 2], \"y\": [3, 4]})\n    write_to_disk(df, \"write_to_disk.csv\")\n\n    file_on_disk = Path(\"write_to_disk.csv\")\n    assert file_on_disk.exists()\n\n    file_on_disk.unlink()\n\n\ndef test_backed_bad_format(bad_adata_path):\n    access = backed_access(bad_adata_path, using_key=None)\n\n    assert access.obsp[\"test\"].to_memory().sum() == 30\n\n    sub = access[:10]\n\n    assert sub.X.shape == (10, 200)\n    assert sub.obsp[\"test\"].sum() == 10\n\n    assert isinstance(sub.obs, pd.DataFrame)\n    assert isinstance(sub.var, pd.DataFrame)\n    assert 
isinstance(sub.obs_names, pd.Index)\n    assert isinstance(sub.var_names, pd.Index)\n\n    assert sub.to_memory().shape == (10, 200)\n\n    access.close()\n    bad_adata_path.unlink()\n\n\ndef test_backed_zarr_not_adata():\n    zarr_pth = Path(\"./not_adata.zarr\")\n    store = zarr.open(zarr_pth, mode=\"w\")\n    store[\"test\"] = np.array([\"test\"])\n\n    access = backed_access(zarr_pth)\n\n    assert type(access).__name__ == \"BackedAccessor\"\n    assert type(access).__module__ == \"lamindb.core.storage._backed_access\"\n    assert access.storage[\"test\"][...] == \"test\"\n\n    shutil.rmtree(zarr_pth)\n\n\ndef test_anndata_open_mode():\n    fp = ln.examples.datasets.anndata_file_pbmc68k_test()\n    artifact = ln.Artifact(fp, key=\"test_adata.h5ad\").save()\n\n    with artifact.open(mode=\"r\") as access:\n        # TODO: add back proper type checking once reset_django() is gone\n        assert type(access).__name__ == \"AnnDataAccessor\"\n        assert type(access).__module__ == \"lamindb.core.storage._anndata_accessor\"\n    # can't open in write mode if not tiledbsoma\n    with pytest.raises(ValueError):\n        artifact.open(mode=\"w\")\n\n    artifact.delete(permanent=True, storage=True)\n\n\ndef test_from_lazy():\n    # a different suffix in key\n    with pytest.raises(ValueError):\n        ln.Artifact.from_lazy(\n            suffix=\".zarr\", overwrite_versions=True, key=\"mydata.h5ad\"\n        )\n\n    lazy = ln.Artifact.from_lazy(\n        suffix=\".zarr\", overwrite_versions=True, key=\"mydata.zarr\"\n    )\n\n    store = zarr.open(lazy.path, mode=\"w\")\n    store[\"test\"] = np.array([\"test\"])\n\n    artifact = lazy.save()\n\n    path_str = artifact.path.as_posix()\n    assert \".lamindb\" in path_str\n    assert artifact.uid[:16] in path_str\n\n    access = artifact.open()\n    assert access.storage[\"test\"][...] 
== \"test\"\n\n    artifact.delete(permanent=True, storage=True)\n\n\ndef test_zarr_open_mode_overwrite_versions_false():\n    lazy = ln.Artifact.from_lazy(\n        suffix=\".zarr\", overwrite_versions=False, key=\"mydata_overwrite_false.zarr\"\n    )\n    store = zarr.open(lazy.path, mode=\"w\")\n    store[\"test\"] = np.array([\"test\"])\n    artifact = lazy.save()\n\n    with pytest.raises(ValueError, match=\"overwrite_versions=False\"):\n        artifact.open(mode=\"r+\")\n\n    artifact.delete(permanent=True, storage=True)\n\n\ndef test_from_lazy_cloud():\n    previous_storage = ln.setup.settings.storage.root_as_str\n    ln.settings.storage = \"s3://lamindb-test/storage\"\n    lazy = ln.Artifact.from_lazy(\n        suffix=\".zarr\", overwrite_versions=True, key=\"stream_test.zarr\"\n    )\n    store = zarr.storage.FsspecStore.from_url(lazy.path.as_posix())\n    group = zarr.open(store, mode=\"w\")\n    group[\"ones\"] = np.ones(3)\n    artifact = lazy.save()\n    access = artifact.open()\n    np.testing.assert_array_equal(access.storage[\"ones\"][...], np.ones(3))\n    artifact.delete(permanent=True, storage=True)\n    ln.settings.storage = previous_storage\n\n\ndef test_polars_options():\n    storepath = ln.UPath(\n        \"s3://bucket/key?endpoint_url=http://localhost:9000/s3\", anon=True\n    )\n    storage_options = _polars_options(storepath)[\"storage_options\"]\n    assert storage_options[\"aws_endpoint_url\"] == \"http://localhost:9000/s3\"\n    assert not storage_options[\"aws_virtual_hosted_style_request\"]\n    assert storage_options[\"aws_allow_http\"]\n    assert storage_options[\"aws_skip_signature\"]\n\n\ndef test_open_dataframe_artifact():\n    previous_storage = ln.setup.settings.storage.root_as_str\n    ln.settings.storage = \"s3://lamindb-test/storage\"\n    # open from managed bucket\n    artifact_remote = ln.Artifact.connect(\"laminlabs/lamin-dev\").get(\n        \"iw9RRhFApeJVHC1L0001\"\n    )\n    with 
artifact_remote.open(engine=\"polars\") as ldf:\n        assert ldf.collect().shape == (3, 5)\n    # test passing credentials directly\n    artifact_path = artifact_remote.path\n    aws_key = artifact_path.fs.session._credentials._access_key\n    aws_secret = artifact_path.fs.session._credentials._secret_key\n    aws_token = artifact_path.fs.session._credentials._token\n    test_path = ln.UPath(\n        artifact_path.as_posix(),\n        key=aws_key,\n        secret=aws_secret,\n        token=aws_token,\n    )\n    with _open_polars_lazy_df(test_path) as ldf:\n        assert ldf.collect().shape == (3, 5)\n\n    df = pd.DataFrame({\"feat1\": [0, 0, 1, 1], \"feat2\": [6, 7, 8, 9]})\n    # check as non-partitioned file\n    df.to_parquet(\"save_df.parquet\", engine=\"pyarrow\")\n    artifact_file = ln.Artifact(\n        \"save_df.parquet\", description=\"Test non-partitioned parquet\"\n    )\n    artifact_file.save()\n    # cached after saving\n    ds = artifact_file.open()\n    assert ds.to_table().to_pandas().equals(df)\n    # remove cache\n    artifact_file.cache().unlink()\n    # pyarrow\n    ds = artifact_file.open(engine=\"pyarrow\")\n    assert ds.to_table().to_pandas().equals(df)\n    # polars\n    with artifact_file.open(engine=\"polars\") as ldf:\n        assert ldf.collect().to_pandas().equals(df)\n    # wrong engine\n    with pytest.raises(ValueError) as err:\n        artifact_file.open(engine=\"some-other-engine\")\n    assert err.exconly().startswith(\"ValueError: Unknown engine\")\n    # check as partitioned folder\n    df.to_parquet(\"save_df\", engine=\"pyarrow\", partition_cols=[\"feat1\"])\n    assert Path(\"save_df\").is_dir()\n    artifact_folder = ln.Artifact(\"save_df\", description=\"Test partitioned parquet\")\n    artifact_folder.save()\n    # cached after saving\n    ds = artifact_folder.open()\n    assert ds.to_table().to_pandas().equals(df[[\"feat2\"]])\n    # remove cache\n    shutil.rmtree(artifact_folder.cache())\n    # pyarrow\n    ds 
= artifact_folder.open()\n    assert ds.to_table().to_pandas().equals(df[[\"feat2\"]])\n    # polars\n    with artifact_folder.open(engine=\"polars\") as ldf:\n        assert ldf.collect().to_pandas().equals(df[[\"feat2\"]])\n    with artifact_folder.open(engine=\"polars\", use_fsspec=True) as ldf:\n        assert ldf.collect().to_pandas().equals(df[[\"feat2\"]])\n\n    artifact_file.delete(permanent=True)\n    artifact_folder.delete(permanent=True)\n\n    ln.settings.storage = previous_storage\n\n\ndef test_open_dataframe_collection():\n    previous_storage = ln.setup.settings.storage.root_as_str\n    ln.settings.storage = \"s3://lamindb-test/storage\"\n\n    df = pd.DataFrame({\"feat1\": [0, 0, 1, 1], \"feat2\": [6, 7, 8, 9]})\n    shard1 = ln.UPath(\"df1.parquet\")\n    shard2 = ln.UPath(\"df2.parquet\")\n    df[:2].to_parquet(shard1, engine=\"pyarrow\")\n    df[2:].to_parquet(shard2, engine=\"pyarrow\")\n    # test checking and opening local paths\n    assert _flat_suffixes(shard1) == {\".parquet\"}\n    assert _flat_suffixes([shard1, ln.UPath(\"some.csv\")]) == {\".parquet\", \".csv\"}\n    assert _open_pyarrow_dataset([shard1, shard2]).to_table().to_pandas().equals(df)\n\n    ln.examples.datasets.file_mini_csv()\n\n    artifact1 = ln.Artifact(shard1, key=\"df1.parquet\").save()\n    artifact2 = ln.Artifact(shard2, key=\"df2.parquet\").save()\n    artifact3 = ln.Artifact(\"mini.csv\", key=\"mini.csv\").save()\n    artifact4 = ln.Artifact(\n        \"https://lamindb-test.s3.amazonaws.com/schmidt22-crispra-gws-IFNG.csv\"\n    ).save()\n\n    collection1 = ln.Collection([artifact1, artifact2], key=\"parquet_col\")\n    # before saving\n    # engine=\"pyarrow\" by default\n    assert collection1.open().to_table().to_pandas().equals(df)\n    # after saving\n    collection1.save()\n    # pyarrow\n    assert collection1.open(engine=\"pyarrow\").to_table().to_pandas().equals(df)\n    # polars\n    with collection1.open(engine=\"polars\") as ldf:\n        assert 
ldf.collect().to_pandas().equals(df)\n    with collection1.open(engine=\"polars\", use_fsspec=True) as ldf:\n        assert ldf.collect().to_pandas().equals(df)\n    # wrong engine\n    with pytest.raises(ValueError) as err:\n        collection1.open(engine=\"some-other-engine\")\n    assert err.exconly().startswith(\"ValueError: Unknown engine\")\n    # different file formats\n    collection2 = ln.Collection([artifact1, artifact3], key=\"parquet_csv_col\").save()\n    with pytest.raises(ValueError) as err:\n        collection2.open()\n    assert err.exconly().startswith(\n        \"ValueError: The artifacts in the collection have different file formats\"\n    )\n    # different filesystems with pyarrow\n    collection3 = ln.Collection([artifact3, artifact4], key=\"s3_http_col\").save()\n    with pytest.raises(ValueError) as err:\n        collection3.open()\n    assert err.exconly().startswith(\n        \"ValueError: The collection has artifacts with different filesystems, this is not supported\"\n    )\n\n    shard1.unlink()\n    shard2.unlink()\n\n    collection1.delete(permanent=True)\n    collection2.delete(permanent=True)\n    collection3.delete(permanent=True)\n\n    artifact1.delete(permanent=True)\n    artifact2.delete(permanent=True)\n    artifact3.delete(permanent=True)\n    artifact4.delete(permanent=True, storage=False)\n\n    Path(\"mini.csv\").unlink(missing_ok=True)\n\n    ln.settings.storage = previous_storage\n\n\ndef test_backed_wrong_suffix():\n    fp = Path(\"test_file.txt\")\n    fp.write_text(\"test open with wrong suffix\")\n\n    artifact = ln.Artifact(fp, description=\"Test open wrong suffix\")\n    # do not save here, it just tries to open the local path\n    with pytest.raises(ValueError):\n        artifact.open()\n\n    fp.unlink()\n\n\ndef test_anndata_n_observations(bad_adata_path):\n    assert _anndata_n_observations(bad_adata_path) == 30\n\n    assert _anndata_n_observations(\"./path_does_not_exist.h5ad\") is None\n    assert 
_anndata_n_observations(\"./path_does_not_exist.zarr\") is None\n\n    corrupted_path = Path(\"./corrupted.h5ad\")\n    shutil.copy(bad_adata_path, corrupted_path)\n    with h5py.File(corrupted_path, mode=\"r+\") as f:\n        del f[\"obs\"]\n        assert \"obs\" not in f\n    assert _anndata_n_observations(corrupted_path) is None\n    corrupted_path.unlink()\n\n    adata = ln.examples.datasets.anndata_pbmc68k_reduced()\n    assert _anndata_n_observations(adata) == adata.n_obs\n    zarr_path = \"./test_adata_n_obs.zarr\"\n    adata.write_zarr(zarr_path)\n    assert _anndata_n_observations(zarr_path) == adata.n_obs\n\n    del zarr.open(zarr_path, mode=\"r+\")[\"obs\"].attrs[\"_index\"]\n    assert _anndata_n_observations(zarr_path) == adata.n_obs\n\n    shutil.rmtree(zarr_path)\n\n\ndef _compress(input_filepath, output_filepath):\n    with open(input_filepath, \"rb\") as f_in:\n        with gzip.open(output_filepath, \"wb\") as f_out:\n            shutil.copyfileobj(f_in, f_out)\n\n\n@pytest.mark.parametrize(\"gz_suffix\", [\".gz\", \".tar.gz\"])\ndef test_compressed(gz_suffix):\n    adata_f = ln.examples.datasets.anndata_file_pbmc68k_test()\n    adata_gz = adata_f.with_suffix(adata_f.suffix + gz_suffix)\n    _compress(adata_f, adata_gz)\n\n    artifact = ln.Artifact.from_anndata(adata_gz, key=\"adata.h5ad\" + gz_suffix).save()\n    assert artifact.n_observations == 30\n\n    with artifact.open() as store:\n        assert type(store).__name__ == \"AnnDataAccessor\"\n        assert type(store).__module__ == \"lamindb.core.storage._anndata_accessor\"\n\n    assert isinstance(artifact.load(), ad.AnnData)\n\n    with pytest.raises(OSError):\n        artifact.open(compression=None)\n\n    artifact.delete(permanent=True)\n    adata_gz.unlink()\n"
  },
  {
    "path": "tests/storage/test_transfer.py",
    "content": "from unittest.mock import patch\n\nimport bionty as bt\nimport lamindb as ln\nimport pytest\nfrom lamindb.models._django import get_artifact_or_run_with_related\n\n\ndef test_describe_artifact_from_remote_instance(capsys):\n    # test describing from a remote instance with less modules\n    artifact = ln.Artifact.connect(\"laminlabs/lamin-site-assets\").first()\n    artifact.describe()\n    captured = capsys.readouterr()\n    assert len(captured.out) > 50\n    assert \"artifact\" in captured.out.lower()\n\n\ndef test_transfer_from_remote_to_local(ccaplog):\n    \"\"\"Test transfer from remote to local instance.\"\"\"\n\n    bt.Gene.filter().delete(permanent=True)\n    bt.Organism.filter().delete(permanent=True)\n    ln.ULabel.filter().delete(permanent=True)\n    bt.CellType.filter().delete(permanent=True)\n\n    # test transfer from an instance with an extra schema module: pertdb\n    # we also made sure that the artifact here has a pertdb label attached\n\n    # transfer 1st artifact\n    artifact1 = ln.Artifact.connect(\"laminlabs/lamin-dev\").get(\"livFRRpM\")\n\n    # test describe postgres\n    result = get_artifact_or_run_with_related(\n        artifact1,\n        include_m2m=True,\n        include_fk=True,\n        include_feature_link=True,\n        include_schema=True,\n    )\n    assert result[\"related_data\"][\"m2m\"][\"tissues\"] == {\n        2: {\n            \"id\": 2,\n            \"uid\": \"6VHBo6XsJZqmaQ\",\n            \"abbr\": None,\n            \"name\": \"cortex of kidney\",\n            \"tissue\": 2,\n            \"feature\": None,\n            \"ontology_id\": \"UBERON:0001225\",\n            \"tissue_display\": \"cortex of kidney\",\n        }\n    }\n    assert sorted(\n        result[\"related_data\"][\"link\"][\"links_ulabel\"], key=lambda d: d[\"id\"]\n    ) == [\n        {\n            \"id\": 7,\n            \"uid\": \"ydyPUMjh\",\n            \"name\": \"donor_24\",\n            \"ulabel\": 15,\n            
\"feature\": 1,\n            \"reference\": None,\n            \"reference_type\": None,\n            \"ulabel_display\": \"donor_24\",\n        },\n        {\n            \"id\": 8,\n            \"uid\": \"JJ3d8a2v\",\n            \"name\": \"na\",\n            \"ulabel\": 10,\n            \"feature\": 10,\n            \"reference\": None,\n            \"reference_type\": None,\n            \"ulabel_display\": \"na\",\n        },\n    ]\n    assert result[\"related_data\"][\"m2m_schemas\"][615][0] == \"obs\"\n    assert result[\"related_data\"][\"m2m_schemas\"][615][1] == {\n        \"Feature\": [\n            \"donor_id\",\n            \"development_stage\",\n            \"disease\",\n            \"cell_type\",\n            \"sex\",\n            \"assay\",\n            \"tissue\",\n            \"self_reported_ethnicity\",\n            \"tissue_type\",\n            \"suspension_type\",\n            \"organism\",\n        ]\n    }\n    assert result[\"related_data\"][\"fk\"][\"storage\"] == {\n        \"id\": 4,\n        \"name\": \"s3://cellxgene-data-public\",\n    }\n\n    id_remote = artifact1.id\n    run_remote = artifact1.run\n    transform_remote = artifact1.transform\n    created_by_remote = artifact1.created_by\n    storage_remote = artifact1.storage\n    organism_remote = artifact1.organisms.get(name=\"human\")\n\n    artifact1.save(transfer=\"annotations\")\n    # assert MODULE_WASNT_CONFIGURED_MESSAGE_TEMPLATE.format(\"pertdb\") in ccaplog.text\n\n    # check all ids are adjusted\n    assert id_remote != artifact1.id\n    assert run_remote != artifact1.run\n    assert transform_remote != artifact1.transform\n    assert created_by_remote.handle != artifact1.created_by.handle\n    assert storage_remote.uid == artifact1.storage.uid\n    assert storage_remote.created_at == artifact1.storage.created_at\n    organism = artifact1.organisms.get(name=\"human\")\n    assert organism.created_at != organism_remote.created_at\n\n    # now check that this is 
idempotent and we can run it again\n    artifact_repeat = ln.Artifact.connect(\"laminlabs/lamin-dev\").get(\n        \"livFRRpMaOgb3y8U2mK2\"\n    )\n    artifact_repeat.save(transfer=\"annotations\")\n\n    # now prepare a new test case\n    # mimic we have an existing feature with a different uid but same name\n    feature = ln.Feature.get(name=\"organism\")\n    feature.uid = \"existing\"\n    feature.save()\n\n    # transfer 2nd artifact\n    artifact2 = ln.Artifact.connect(\"laminlabs/lamin-dev\").get(\"qz35YaRk\")\n    artifact2.save(transfer=\"annotations\")\n\n    # check the feature name\n    assert artifact2.organisms.get(name=\"mouse\")\n    assert (\n        artifact1.features.slots[\"obs\"].members.get(name=\"organism\").uid == \"existing\"\n    )\n\n    # test transfer from an instance with fewer modules (laminlabs/lamin-site-assets)\n    artifact3 = ln.Artifact.connect(\"laminlabs/lamin-site-assets\").get(\n        \"lgRNHNtMxjU0y8nIagt7\"\n    )\n    # test that implicit saving through `load()` works (also occurs for `cache()` or `open()` for run input tracking)\n    artifact3.load()\n\n    # delete with storage=False, because these are all stored in the source instances\n    artifact1.delete(storage=False, permanent=True)\n    artifact2.delete(storage=False, permanent=True)\n    artifact3.delete(\n        storage=False\n    )  # there is an issue here with permanent deletion because of schema module mismatch\n\n\ndef test_transfer_into_space():\n    # grab any ulabel from the default space\n    ulabel = ln.ULabel.connect(\"laminlabs/lamin-dev\").filter(space__id=1).first()\n\n    space = ln.Space(name=\"space for transfer\", uid=\"00000123\").save()\n    with patch.object(ln.context, \"_space\", new=space):\n        ulabel.save()\n    assert ulabel.space_id == space.id\n\n    ulabel.delete(permanent=True)\n    space.delete()\n\n\ndef test_using_record_organism():\n    \"\"\"Test passing record and organism to the using_key instance.\"\"\"\n    
import bionty as bt\n\n    release_110_cxg = bt.Source.connect(\"laminlabs/lamin-dev\").get(\n        organism=\"mouse\", entity=\"bionty.Gene\", version=\"release-110\"\n    )\n    release_112_cxg = bt.Source.connect(\"laminlabs/lamin-dev\").get(\n        organism=\"mouse\", entity=\"bionty.Gene\", version=\"release-112\"\n    )\n    release_110 = release_110_cxg.save()  # transfer source record\n    release_110_cxg = (  # re-fetch\n        bt.Source.connect(\"laminlabs/lamin-dev\").get(\n            organism=\"mouse\", entity=\"bionty.Gene\", version=\"release-110\"\n        )\n    )\n\n    # passing the wrong source\n    inspector = bt.Gene.connect(\"laminlabs/lamin-dev\").inspect(\n        [\"ENSMUSG00000102862\", \"ENSMUSG00000084826\"],\n        field=bt.Gene.ensembl_gene_id,\n        source=release_112_cxg,\n        strict_source=True,\n    )\n    assert len(inspector.validated) == 0\n\n    # passing the correct source\n    inspector = bt.Gene.connect(\"laminlabs/lamin-dev\").inspect(\n        [\"ENSMUSG00000102862\", \"ENSMUSG00000084826\"],\n        field=bt.Gene.ensembl_gene_id,\n        source=release_110_cxg,\n        strict_source=True,\n    )\n    assert len(inspector.validated) == 2\n\n    # passing the correct source but from the wrong instance\n    with pytest.raises(ValueError) as error:\n        inspector = bt.Gene.connect(\"laminlabs/lamin-dev\").inspect(\n            [\"ENSMUSG00000102862\", \"ENSMUSG00000084826\"],\n            field=bt.Gene.ensembl_gene_id,\n            source=release_110,\n        )\n    assert (\n        \"record must be a bionty.Source record from instance 'laminlabs/lamin-dev'\"\n        in str(error.value)\n    )\n\n\ndef test_using_query_by_feature():\n    assert ln.Artifact.connect(\"laminlabs/cellxgene\").filter(n_of_donors__gte=100)\n\n\n# TODO: uncomment after migrations\n# def test_transfer_features_uid():\n#     \"\"\"Test that a new feature is created based on uid.\"\"\"\n#     existing_tissue_feature = (\n#      
   ln.Feature.connect(\"laminlabs/lamin-dev\").get(name=\"tissue\").save()\n#     )\n#     artifact = ln.Artifact.connect(\"laminlabs/pertdata\").get(\"aT2dp4hC6XDwrafN\")\n#     artifact.save(transfer=\"annotations\")\n#     # now a new feature called \"tissue\" is created because the uid is different\n#     newly_transferred_tissue_feature = ln.Feature.get(\n#         name=\"tissue\", schemas__artifacts__uid=artifact.uid\n#     )\n#     assert existing_tissue_feature.uid != newly_transferred_tissue_feature.uid\n"
  },
  {
    "path": "tests/tiledbsoma/conftest.py",
    "content": "import os\nimport shutil\nfrom pathlib import Path\nfrom time import perf_counter\n\nimport lamindb as ln\nimport lamindb_setup as ln_setup\nimport numpy as np\nimport pandas as pd\nimport pytest\nfrom lamin_utils import logger\n\n\ndef pytest_sessionstart():\n    t_execute_start = perf_counter()\n    ln_setup._TESTING = True\n    os.environ[\"LAMIN_TESTING\"] = \"true\"\n    os.environ[\"LAMINDB_TEST_DB_VENDOR\"] = \"sqlite\"\n    print(\"running tests on SQLite\")\n    ln.setup.init(\n        storage=\"./default_storage_tiledbsoma\",\n        modules=\"bionty\",\n        name=\"lamindb-unit-tests-tiledbsoma\",\n    )\n    ln.settings.creation.artifact_silence_missing_run_warning = True\n    # Pre-register remote roots used in tests so `ln.settings.storage = ...`\n    # doesn't prompt for interactive confirmation under pytest capture.\n    ln.Storage(\"s3://lamindb-test/tiledbsoma\").save()\n    total_time_elapsed = perf_counter() - t_execute_start\n    print(f\"time to setup the instance: {total_time_elapsed:.1f}s\")\n\n\ndef pytest_sessionfinish(session: pytest.Session):\n    logger.set_verbosity(1)\n    if Path(\"./default_storage_tiledbsoma\").exists():\n        shutil.rmtree(\"./default_storage_tiledbsoma\")\n    upath = ln_setup.core.upath.UPath(\"s3://lamindb-test/tiledbsoma\")\n    if upath.exists():\n        upath.rmdir()\n    ln.setup.delete(\"lamindb-unit-tests-tiledbsoma\", force=True)\n    del os.environ[\"LAMIN_TESTING\"]\n\n\n@pytest.fixture(scope=\"session\")\ndef adata_file():\n    import anndata as ad\n\n    adata = ad.AnnData(\n        X=np.array([[1, 2, 3], [4, 5, 6]]),\n        obs={\"feat1\": [\"A\", \"B\"]},\n        var=pd.DataFrame(index=[\"MYC\", \"TCF7\", \"GATA1\"]),\n        obsm={\"X_pca\": np.array([[1, 2], [3, 4]])},\n    )\n    filepath = Path(\"adata_file.h5ad\")\n    adata.write(filepath)\n    yield \"adata_file.h5ad\"\n    filepath.unlink(missing_ok=True)\n\n\n@pytest.fixture(scope=\"function\")\ndef 
clean_soma_files(request):\n    path = request.param if hasattr(request, \"param\") else \"small_dataset.tiledbsoma\"\n    if Path(path).exists():\n        shutil.rmtree(path)\n\n    yield path\n\n    if Path(path).exists():\n        shutil.rmtree(path)\n\n\n@pytest.fixture(scope=\"function\")\ndef soma_experiment_file(clean_soma_files):\n    import tiledbsoma.io\n\n    adata = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"AnnData\")\n    tiledbsoma.io.from_anndata(\"test.tiledbsoma\", adata, measurement_name=\"RNA\")\n    yield \"test.tiledbsoma\"\n    if Path(\"test.tiledbsoma\").exists():\n        shutil.rmtree(\"test.tiledbsoma\")\n"
  },
  {
    "path": "tests/tiledbsoma/test_artifact_basics.py",
    "content": "import lamindb as ln\nimport pytest\nfrom lamindb.models.artifact import data_is_soma_experiment\n\n\ndef test_create_from_soma_experiment(soma_experiment_file, adata_file):\n    with pytest.raises(ValueError) as error:\n        ln.Artifact.from_tiledbsoma(adata_file, description=\"test1\")\n    assert (\n        \"data has to be a SOMA Experiment object or a path to SOMA Experiment store.\"\n        in error.exconly()\n    )\n\n    af = ln.Artifact.from_tiledbsoma(soma_experiment_file, description=\"test1\")\n    assert af.description == \"test1\"\n    assert af.key is None\n    assert af.otype == \"tiledbsoma\"\n    assert af.n_observations == 3\n\n\ndef test_data_is_soma_experiment_paths():\n    assert data_is_soma_experiment(\"something.tiledbsoma\")\n    assert data_is_soma_experiment(ln.UPath(\"something.tiledbsoma\"))\n\n\ndef test_data_is_soma_experiment(soma_experiment_file):\n    import tiledbsoma\n\n    with tiledbsoma.Experiment.open(soma_experiment_file) as store:\n        assert data_is_soma_experiment(store)\n"
  },
  {
    "path": "tests/tiledbsoma/test_curators.py",
    "content": "import shutil\n\nimport bionty as bt\nimport lamindb as ln\nimport pytest\nimport tiledbsoma\nimport tiledbsoma.io\n\n\ndef test_tiledbsoma_curator(clean_soma_files):\n    \"\"\"Test TiledbSomaExperimentCurator with schema.\"\"\"\n    obs_schema = ln.Schema(\n        features=[\n            ln.Feature(name=\"cell_type_by_expert\", dtype=bt.CellType).save(),\n            ln.Feature(name=\"cell_type_by_model\", dtype=bt.CellType).save(),\n        ],\n    ).save()\n\n    var_schema = ln.Schema(\n        features=[\n            ln.Feature(name=\"var_id\", dtype=bt.Gene.ensembl_gene_id).save(),\n        ],\n        coerce=True,\n    ).save()\n\n    soma_schema = ln.Schema(\n        otype=\"tiledbsoma\",\n        slots={\n            \"obs\": obs_schema,\n            \"ms:RNA\": var_schema,\n        },\n    ).save()\n\n    # Convert AnnData to SOMA format\n    adata = ln.examples.datasets.mini_immuno.get_dataset1(otype=\"AnnData\")\n    tiledbsoma.io.from_anndata(\n        \"small_dataset.tiledbsoma\", adata, measurement_name=\"RNA\"\n    )\n\n    # Test with invalid dataset\n    with pytest.raises(ln.errors.InvalidArgument) as e:\n        ln.curators.TiledbsomaExperimentCurator(adata, soma_schema)\n    assert \"dataset must be SOMAExperiment-like.\" in str(e.value)\n\n    # Test with invalid schema\n    with tiledbsoma.Experiment.open(\"small_dataset.tiledbsoma\") as experiment:\n        with pytest.raises(ln.errors.InvalidArgument) as e:\n            ln.curators.TiledbsomaExperimentCurator(experiment, schema=var_schema)\n        assert \"Schema otype must be 'tiledbsoma'.\" in str(e.value)\n\n    with tiledbsoma.Experiment.open(\"small_dataset.tiledbsoma\") as experiment:\n        curator = ln.curators.TiledbsomaExperimentCurator(experiment, soma_schema)\n\n        assert \"obs\" in curator.slots\n        assert \"ms:RNA\" in curator.slots\n\n        curator.validate()\n\n        artifact = curator.save_artifact(\n            
key=\"examples/soma_experiment.tiledbsoma\",\n            description=\"SOMA experiment with schema validation\",\n        )\n\n        assert artifact.schema == soma_schema\n        assert \"obs\" in artifact.features.slots\n        assert \"ms:RNA\" in artifact.features.slots\n\n        # Check feature values are properly annotated\n        assert set(artifact.features.get_values()[\"cell_type_by_expert\"]) == {\n            \"CD8-positive, alpha-beta T cell\",\n            \"B cell\",\n        }\n        assert set(artifact.features.get_values()[\"cell_type_by_model\"]) == {\n            \"T cell\",\n            \"B cell\",\n        }\n\n    # Altered data (gene typo)\n    adata_typo = ln.examples.datasets.mini_immuno.get_dataset1(\n        otype=\"AnnData\", with_gene_typo=True\n    )\n    typo_soma_path = \"./mini_immuno_dataset1_typo.tiledbsoma\"\n    tiledbsoma.io.from_anndata(typo_soma_path, adata_typo, measurement_name=\"RNA\")\n    with tiledbsoma.Experiment.open(typo_soma_path) as experiment_typo:\n        curator_typo = ln.curators.TiledbsomaExperimentCurator(\n            experiment_typo, soma_schema\n        )\n\n        # Validation should fail due to typo\n        with pytest.raises(ln.errors.ValidationError) as error:\n            curator_typo.validate()\n        assert \"GeneTypo\" in str(error.value)\n\n    # Clean up\n    shutil.rmtree(typo_soma_path)\n    artifact.delete(permanent=True)\n    soma_schema.delete(permanent=True)\n    var_schema.delete(permanent=True)\n    obs_schema.delete(permanent=True)\n"
  },
  {
    "path": "tests/tiledbsoma/test_storage.py",
    "content": "import shutil\nfrom pathlib import Path\n\nimport lamindb as ln\nimport numpy as np\nimport pytest\nimport tiledbsoma\nimport tiledbsoma.io\nfrom lamindb.core.loaders import load_h5ad\nfrom lamindb.core.storage._tiledbsoma import (\n    SOMAS3ContextFactory,\n    _open_tiledbsoma,\n    _soma_store_n_observations,\n)\nfrom lamindb.integrations import save_tiledbsoma_experiment\n\n\n@pytest.mark.parametrize(\"storage\", [None, \"s3://lamindb-test/tiledbsoma\"])\ndef test_write_read_tiledbsoma(storage):\n    if storage is not None:\n        previous_storage = ln.setup.settings.storage.root_as_str\n        ln.settings.storage = storage\n\n    test_file = ln.examples.datasets.anndata_file_pbmc68k_test()\n    adata = load_h5ad(test_file)\n    # write less\n    adata = adata[:5, :2].copy()\n    del adata.varp\n    del adata.obsp\n    del adata.layers\n    del adata.uns  # seems to cause problems for append\n    if storage is None:\n        # test local with zarr\n        test_file = test_file.with_suffix(\".zarr\")\n        adata.write_zarr(test_file)\n    else:\n        adata.write_h5ad(test_file)\n\n    create_transform = ln.Transform(key=\"test create tiledbsoma store\").save()\n    create_run = ln.Run(create_transform).save()\n\n    # fails with a view\n    with pytest.raises(ValueError, match=\"Can not write an `AnnData` view\"):\n        save_tiledbsoma_experiment([adata[:2]], run=create_run, measurement_name=\"RNA\")\n\n    artifact_soma = save_tiledbsoma_experiment(\n        [test_file],\n        description=\"test tiledbsoma\",\n        key=\"scrna/my-big-dataset.tiledbsoma\",  # can also be None, but that's trivial\n        run=create_run,\n        measurement_name=\"RNA\",\n    )\n    assert artifact_soma.path.stem == artifact_soma.uid[:16]\n    assert artifact_soma.key == \"scrna/my-big-dataset.tiledbsoma\"\n    assert artifact_soma.suffix == \".tiledbsoma\"\n    assert artifact_soma._key_is_virtual\n    assert artifact_soma.otype == 
\"tiledbsoma\"\n    assert artifact_soma.n_observations == adata.n_obs\n\n    with artifact_soma.open() as store:  # mode=\"r\" by default\n        assert isinstance(store, tiledbsoma.Experiment)\n        obs = store[\"obs\"]\n        n_obs = len(obs)\n        assert n_obs == adata.n_obs\n        assert \"lamin_run_uid\" in obs.schema.names\n        run_ids = (\n            obs.read(column_names=[\"lamin_run_uid\"])\n            .concat()\n            .to_pandas()[\"lamin_run_uid\"]\n        )\n        assert all(run_ids == create_run.uid)\n        assert set(run_ids.cat.categories) == {create_run.uid}\n        # test reading X\n        ms_rna = store.ms[\"RNA\"]\n        n_vars = len(ms_rna.var)\n        assert n_vars == adata.n_vars\n        X = ms_rna[\"X\"][\"data\"].read().coos((n_obs, n_vars)).concat().to_scipy()\n        assert X.sum() == adata.X.sum()\n\n    cache_path = artifact_soma.cache()\n    hash_before_changes = artifact_soma.hash\n    with artifact_soma.open(mode=\"w\") as store:\n        assert store.__class__.__name__ == \"ExperimentTrack\"\n        tiledbsoma.io.add_matrix_to_collection(\n            exp=store,\n            measurement_name=\"RNA\",\n            collection_name=\"obsm\",\n            matrix_name=\"test_array\",\n            matrix_data=np.ones((n_obs, 2)),\n        )\n    assert artifact_soma.hash != hash_before_changes\n    assert artifact_soma.uid.endswith(\"0001\")\n    if storage is not None:\n        # cache should be ignored and deleted after the changes\n        assert not cache_path.exists()\n    else:\n        assert artifact_soma.path == cache_path\n\n    adata_to_append_1 = adata[:3].copy()\n    adata_to_append_1.obs[\"obs_id\"] = adata_to_append_1.obs.index.to_numpy() + \"***\"\n    adata_to_append_1.var[\"var_id\"] = adata_to_append_1.var.index\n    adata_to_append_2 = adata[3:5].copy()\n    adata_to_append_2.obs[\"obs_id\"] = adata_to_append_2.obs.index.to_numpy() + \"***\"\n    adata_to_append_2.var[\"var_id\"] = 
adata_to_append_2.var.index\n    adata_to_append_2.write_h5ad(\"adata_to_append_2.h5ad\")\n\n    append_transform = ln.Transform(key=\"test append tiledbsoma store\").save()\n    append_run = ln.Run(append_transform).save()\n\n    # here run should be passed\n    with pytest.raises(ValueError, match=\"Pass `run`\"):\n        save_tiledbsoma_experiment(\n            [adata_to_append_1],\n            revises=artifact_soma,\n            run=None,\n            measurement_name=\"RNA\",\n        )\n\n    artifact_soma_append = save_tiledbsoma_experiment(\n        [adata_to_append_1, \"adata_to_append_2.h5ad\"],\n        revises=artifact_soma,\n        run=append_run,\n        measurement_name=\"RNA\",\n        append_obsm_varm=True,\n    )\n    assert artifact_soma_append.uid.endswith(\"0002\")\n    artifact_soma.refresh_from_db()\n    assert not artifact_soma.is_latest\n    match = \"its files were overwritten and are no longer available\"\n    with pytest.raises(ValueError, match=match):\n        artifact_soma.open()\n    with pytest.raises(ValueError, match=match):\n        artifact_soma.load()\n    with pytest.raises(ValueError, match=match):\n        artifact_soma.cache()\n    # below is inherited from \"scrna/my-big-dataset.tiledbsoma\"\n    assert artifact_soma_append.key == \"scrna/my-big-dataset.tiledbsoma\"\n\n    # wrong mode, should be either r or w for tiledbsoma\n    with pytest.raises(ValueError):\n        artifact_soma_append.open(mode=\"p\")\n\n    # test running without the context manager\n    store = artifact_soma_append.open()\n    n_obs_final = adata.n_obs + sum(\n        adt.n_obs for adt in [adata_to_append_1, adata_to_append_2]\n    )\n    obs = store[\"obs\"]\n    assert len(obs) == n_obs_final == artifact_soma_append.n_observations\n    run_ids = (\n        obs.read(column_names=[\"lamin_run_uid\"])\n        .concat()\n        .to_pandas()[\"lamin_run_uid\"]\n        .cat.categories\n    )\n    assert set(run_ids) == {create_run.uid, 
append_run.uid}\n    store.close()\n\n    # test correctness of deletion for _overwrite_versions=True\n    soma_path = artifact_soma_append.path\n    assert soma_path.exists()\n    # select specific version and delete\n    # check that the store is still there\n    assert soma_path.exists()\n    assert ln.Artifact.filter(description=\"test tiledbsoma\").count() == 3\n    artifact_soma_append.versions.filter(uid__endswith=\"0001\").one().delete(\n        permanent=True\n    )\n    assert soma_path.exists()\n    assert ln.Artifact.filter(description=\"test tiledbsoma\").count() == 2\n    # make sure the store is actually deleted\n    artifact_soma_append.delete(permanent=True)\n    assert not soma_path.exists()\n    assert not ln.Artifact.filter(description=\"test tiledbsoma\").exists()\n\n    Path(\"adata_to_append_2.h5ad\").unlink()\n\n    if storage is not None:\n        ln.settings.storage = previous_storage\n\n\ndef test_from_tiledbsoma():\n    test_file = ln.examples.datasets.anndata_file_pbmc68k_test()\n    soma_path = \"mystore.tiledbsoma\"\n    tiledbsoma.io.from_h5ad(soma_path, test_file, measurement_name=\"RNA\")\n    # wrong suffix\n    with pytest.raises(ValueError):\n        ln.Artifact.from_tiledbsoma(\"mystore\")\n\n    artifact = ln.Artifact.from_tiledbsoma(\n        soma_path, description=\"test soma store\"\n    ).save()\n    assert artifact.n_observations == 30\n\n    with _open_tiledbsoma(artifact.path, mode=\"r\") as store:\n        # experiment\n        assert _soma_store_n_observations(store) == 30\n        # dataframe\n        assert _soma_store_n_observations(store.obs) == 30\n        # treat as unstructured collection, data + raw\n        assert _soma_store_n_observations(store.ms) == 60\n        # measurement\n        assert _soma_store_n_observations(store.ms[\"RNA\"]) == 30\n        # array\n        assert _soma_store_n_observations(store.ms[\"RNA\"][\"X\"][\"data\"]) == 30\n\n    artifact.delete(permanent=True)\n    
shutil.rmtree(soma_path)\n\n\ndef test_tiledb_config():\n    storepath = ln.UPath(\"s3://bucket/key?endpoint_url=http://localhost:9000/s3\")\n    tiledb_config = SOMAS3ContextFactory(storepath).get_context().tiledb_config\n    assert tiledb_config[\"vfs.s3.endpoint_override\"] == \"localhost:9000/s3\"\n    assert tiledb_config[\"vfs.s3.scheme\"] == \"http\"\n    assert tiledb_config[\"vfs.s3.use_virtual_addressing\"] == \"false\"\n    assert tiledb_config[\"vfs.s3.region\"] == \"\"\n\n\ndef test_tiledbsoma_in_managed_storage():\n    artifact = ln.Artifact.connect(\"laminlabs/lamindata\").get(\n        key=\"example_datasets/small_dataset1.tiledbsoma\"\n    )\n    path = artifact.path\n    assert \"session\" in path.storage_options\n\n    ctx_factory = SOMAS3ContextFactory(path)\n    assert ctx_factory._refreshable_credentials is not None\n\n    ctx = ctx_factory.get_context()\n    tiledb_config = ctx.tiledb_config\n    assert \"vfs.s3.aws_access_key_id\" in tiledb_config\n    assert \"vfs.s3.aws_secret_access_key\" in tiledb_config\n    assert \"vfs.s3.aws_session_token\" in tiledb_config\n\n    path_str = path.as_posix()\n    # check with managed credentials\n    with tiledbsoma.Experiment.open(path_str, mode=\"r\", context=ctx) as store:\n        assert _soma_store_n_observations(store) == 3\n    # check with anon, s3://lamindata is public\n    with _open_tiledbsoma(ln.UPath(path_str, anon=True), mode=\"r\") as store:\n        assert _soma_store_n_observations(store) == 3\n    # pass credentials manually\n    key = tiledb_config[\"vfs.s3.aws_access_key_id\"]\n    secret = tiledb_config[\"vfs.s3.aws_secret_access_key\"]\n    token = tiledb_config[\"vfs.s3.aws_session_token\"]\n    with _open_tiledbsoma(\n        ln.UPath(path_str, key=key, secret=secret, token=token), mode=\"r\"\n    ) as store:\n        assert _soma_store_n_observations(store) == 3\n"
  }
]