Repository: laminlabs/lamindb Branch: main Commit: 44563e03eeae Files: 288 Total size: 2.7 MB Directory structure: gitextract_xlz91t15/ ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.yml │ │ ├── config.yml │ │ ├── enhancement.yml │ │ └── usage_question.yml │ └── workflows/ │ ├── build.yml │ └── doc-changes.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs/ │ ├── api.md │ ├── arrays.md │ ├── bionty.md │ ├── changelog.md │ ├── curate.md │ ├── faq/ │ │ ├── acid.md │ │ ├── curate-any.md │ │ ├── idempotency.md │ │ ├── import-modules.md │ │ ├── keep-artifacts-local.md │ │ ├── pydantic-pandera.md │ │ ├── reference-field.md │ │ ├── search.md │ │ ├── symbol-mapping.md │ │ ├── test_notebooks.py │ │ ├── track-run-inputs.md │ │ ├── trash-archive.md │ │ └── validate-fields.md │ ├── faq.md │ ├── guide.md │ ├── index.md │ ├── lightning.md │ ├── manage-changes.md │ ├── manage-ontologies.md │ ├── organize.md │ ├── pertdb.md │ ├── query-search.md │ ├── registries.md │ ├── scripts/ │ │ ├── curate_anndata_flexible.py │ │ ├── curate_anndata_uns.py │ │ ├── curate_dataframe_attrs.py │ │ ├── curate_dataframe_external_features.py │ │ ├── curate_dataframe_flexible.py │ │ ├── curate_dataframe_minimal_errors.py │ │ ├── curate_dataframe_union_features.py │ │ ├── curate_mudata.py │ │ ├── curate_soma_experiment.py │ │ ├── curate_spatialdata.py │ │ ├── define_schema_anndata_uns.py │ │ ├── define_schema_df_metadata.py │ │ ├── define_schema_spatialdata.py │ │ ├── my_workflow.py │ │ ├── my_workflow_with_click.py │ │ ├── my_workflow_with_step.py │ │ ├── run_script_with_step.py │ │ ├── run_track_and_finish.py │ │ ├── run_track_with_features_and_params.py │ │ ├── run_track_with_params.py │ │ └── synced_with_git.py │ ├── storage/ │ │ ├── add-replace-cache.ipynb │ │ ├── anndata-accessor.ipynb │ │ ├── prepare-sync-local-to-cloud.ipynb │ │ ├── sync-local-to-cloud.ipynb │ │ ├── test-files/ │ │ │ └── iris.data │ │ ├── test_notebooks.py │ │ ├── upload.ipynb │ │ └── vitessce.ipynb │ ├── storage.md │ ├── sync.md │ ├── test_notebooks.py │ └── track.md ├── lamindb/ │ ├── __init__.py │ ├── _finish.py │ ├── _secret_redaction.py │ ├── _view.py │ ├── base/ │ │ ├── __init__.py │ │ ├── dtypes.py │ │ ├── fields.py │ │ ├── ids.py │ │ ├── types.py │ │ ├── uids.py │ │ ├── users.py │ │ └── utils.py │ ├── core/ │ │ ├── __init__.py │ │ ├── _compat.py │ │ ├── _context.py │ │ ├── _functions.py │ │ ├── _mapped_collection.py │ │ ├── _settings.py │ │ ├── _sync_git.py │ │ ├── _track_environment.py │ │ ├── exceptions.py │ │ ├── loaders.py │ │ ├── storage/ │ │ │ ├── __init__.py │ │ │ ├── _anndata_accessor.py │ │ │ ├── _backed_access.py │ │ │ ├── _polars_lazy_df.py │ │ │ ├── _pyarrow_dataset.py │ │ │ ├── _spatialdata_accessor.py │ │ │ ├── _tiledbsoma.py │ │ │ ├── _valid_suffixes.py │ │ │ ├── _zarr.py │ │ │ ├── objects.py │ │ │ ├── paths.py │ │ │ └── types.py │ │ └── subsettings/ │ │ ├── __init__.py │ │ ├── _annotation_settings.py │ │ └── _creation_settings.py │ ├── curators/ │ │ ├── __init__.py │ │ └── core.py │ ├── errors.py │ ├── examples/ │ │ ├── __init__.py │ │ ├── cellxgene/ │ │ │ ├── __init__.py │ │ │ └── _cellxgene.py │ │ ├── croissant/ │ │ │ ├── __init__.py │ │ │ └── mini_immuno.anndata.zarr_metadata.json │ │ ├── datasets/ │ │ │ ├── __init__.py │ │ │ ├── _core.py │ │ │ ├── _fake.py │ │ │ ├── _small.py │ │ │ ├── define_mini_immuno_features_labels.py │ │ │ ├── define_mini_immuno_schema_flexible.py │ │ │ ├── mini_immuno.py │ │ │ └── save_mini_immuno_datasets.py │ │ ├── fixtures/ │ 
│ │ ├── __init__.py │ │ │ └── sheets.py │ │ ├── mlflow/ │ │ │ └── __init__.py │ │ ├── schemas/ │ │ │ ├── __init__.py │ │ │ ├── _anndata.py │ │ │ ├── _simple.py │ │ │ ├── define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py │ │ │ └── define_valid_features.py │ │ └── wandb/ │ │ └── __init__.py │ ├── integrations/ │ │ ├── __init__.py │ │ ├── _croissant.py │ │ ├── _vitessce.py │ │ └── lightning.py │ ├── migrations/ │ │ ├── 0177_squashed.py │ │ ├── 0178_v2_2.py │ │ ├── 0179_v2_2_part_2.py │ │ ├── 0180_v2_2_part_3.py │ │ ├── 0181_v2_2_part_4.py │ │ ├── 0182_v2_2_part_5.py │ │ ├── 0183_v2_2_part_6.py │ │ ├── 0184_alter_transformrecord_feature.py │ │ ├── 0185_alter_runrecord_feature.py │ │ ├── 0186_v2_4.py │ │ ├── 0187_squashed.py │ │ ├── 0187_v2_4_part_2.py │ │ ├── README.md │ │ └── __init__.py │ ├── models/ │ │ ├── __init__.py │ │ ├── _describe.py │ │ ├── _django.py │ │ ├── _feature_manager.py │ │ ├── _from_values.py │ │ ├── _is_versioned.py │ │ ├── _label_manager.py │ │ ├── _relations.py │ │ ├── _run_cleanup.py │ │ ├── artifact.py │ │ ├── artifact_set.py │ │ ├── block.py │ │ ├── can_curate.py │ │ ├── collection.py │ │ ├── feature.py │ │ ├── has_parents.py │ │ ├── project.py │ │ ├── query_manager.py │ │ ├── query_set.py │ │ ├── record.py │ │ ├── run.py │ │ ├── save.py │ │ ├── schema.py │ │ ├── sqlrecord.py │ │ ├── storage.py │ │ ├── transform.py │ │ └── ulabel.py │ ├── py.typed │ └── setup/ │ ├── __init__.py │ ├── _merge.py │ ├── _switch.py │ ├── core/ │ │ └── __init__.py │ ├── errors/ │ │ └── __init__.py │ └── types/ │ └── __init__.py ├── lamindb_full.py ├── noxfile.py ├── pyproject.full.toml ├── pyproject.toml ├── scripts/ │ └── migrate_test_instances.py └── tests/ ├── core/ │ ├── _dataset_fixtures.py │ ├── conftest.py │ ├── notebooks/ │ │ ├── basic-r-notebook.Rmd.cleaned.html │ │ ├── basic-r-notebook.Rmd.html │ │ ├── duplicate/ │ │ │ └── with-title-initialized-consecutive-finish.ipynb │ │ ├── load_schema.ipynb │ │ ├── no-title.ipynb │ │ ├── with-title-initialized-consecutive-finish-not-last-cell.ipynb │ │ └── with-title-initialized-consecutive-finish.ipynb │ ├── scripts/ │ │ ├── duplicate1/ │ │ │ └── script-to-test-versioning.py │ │ ├── duplicate2/ │ │ │ └── script-to-test-versioning.py │ │ ├── duplicate3/ │ │ │ └── script-to-test-versioning.py │ │ ├── duplicate4/ │ │ │ └── script-to-test-versioning.py │ │ ├── duplicate5/ │ │ │ └── script-to-test-versioning.py │ │ ├── script-to-test-filename-change.py │ │ └── script-to-test-versioning.py │ ├── test_artifact_anndata_with_curation.py │ ├── test_artifact_basics.py │ ├── test_artifact_dataframe_with_curation.py │ ├── test_artifact_describe_to_dataframe.py │ ├── test_artifact_features_annotations.py │ ├── test_artifact_parquet.py │ ├── test_blocks.py │ ├── test_branches.py │ ├── test_can_curate.py │ ├── test_collection.py │ ├── test_curator_basics.py │ ├── test_data_migrations.py │ ├── test_db.py │ ├── test_delete.py │ ├── test_feature.py │ ├── test_feature_dtype.py │ ├── test_from_values.py │ ├── test_has_parents.py │ ├── test_has_type.py │ ├── test_integrity.py │ ├── test_is_versioned.py │ ├── test_label_manager.py │ ├── test_load.py │ ├── test_manager.py │ ├── test_merge.py │ ├── test_nbconvert.py │ ├── test_notebooks.py │ ├── test_querydb.py │ ├── test_queryset.py │ ├── test_record_basics.py │ ├── test_record_sheet_examples.py │ ├── test_rename_features_labels.py │ ├── test_run.py │ ├── test_save.py │ ├── test_schema.py │ ├── test_search.py │ ├── test_settings.py │ ├── test_sqlrecord.py │ ├── test_storage.py │ ├── test_switch.py 
│ ├── test_track_flow.py │ ├── test_track_script_or_notebook.py │ ├── test_track_step.py │ ├── test_transform.py │ ├── test_transform_from_git.py │ └── test_view.py ├── curators/ │ ├── conftest.py │ ├── test_cellxgene_curation.py │ ├── test_curate_from_croissant.py │ ├── test_curators_examples.py │ ├── test_curators_remote.py │ └── test_dataframe_curation.py ├── integrations/ │ ├── conftest.py │ └── test_lightning.py ├── no_instance/ │ ├── conftest.py │ ├── test_connect_dynamic_import.py │ ├── test_import_side_effects.py │ └── test_no_default_instance.py ├── permissions/ │ ├── conftest.py │ ├── jwt_utils.py │ ├── scripts/ │ │ ├── check_lamin_dev.py │ │ ├── setup_access.py │ │ └── setup_instance.py │ └── test_rls_dbwritelog.py ├── profiling/ │ ├── import_lamindb.py │ ├── import_lamindb_and_connect.py │ ├── import_lamindb_core_storage.py │ └── import_records_from_dataframe.py ├── storage/ │ ├── conftest.py │ ├── test_artifact_storage.py │ ├── test_artifact_zarr.py │ ├── test_cache.py │ ├── test_connect_reconnect.py │ ├── test_storage_lifecycle.py │ ├── test_streaming.py │ └── test_transfer.py └── tiledbsoma/ ├── conftest.py ├── test_artifact_basics.py ├── test_curators.py └── test_storage.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.yml ================================================ name: Report a bug description: Report a bug. labels: - ":bug: bug" body: - type: textarea id: report attributes: label: Add a description placeholder: | Describe and consider providing version information. Please ensure you're on the latest version of lamindb. This is a public repository! Do not reveal any internal information. validations: required: true ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: true contact_links: - name: LaminHub issues url: https://github.com/laminlabs/laminhub-public about: If you have issues with the GUI/web app at lamin.ai, please report them here. - name: Enterprise support url: https://lamin.ai/contact about: If you have other questions, contact us directly. ================================================ FILE: .github/ISSUE_TEMPLATE/enhancement.yml ================================================ name: Propose an enhancement description: Propose an enhancement. body: - type: textarea id: description attributes: label: Add a description placeholder: | This is a public repository! Do not reveal any internal information. validations: required: true ================================================ FILE: .github/ISSUE_TEMPLATE/usage_question.yml ================================================ name: Ask a usage question description: Ask a usage question. labels: - "usage question" body: - type: textarea id: description attributes: label: Add a description placeholder: | This is a public repository! Do not reveal any internal information. 
validations: required: true ================================================ FILE: .github/workflows/build.yml ================================================ name: build on: push: branches: [release] pull_request: jobs: pre-filter: runs-on: ubuntu-latest outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: - uses: actions/checkout@v6 with: fetch-depth: 0 - uses: dorny/paths-filter@v3 id: changes if: github.event_name != 'push' with: filters: | curator: - 'lamindb/curators/**' - 'lamindb/examples/cellxgene/**' - 'tests/curators/**' integrations: - 'lamindb/integrations/**' - 'tests/integrations/**' - id: set-matrix shell: bash run: | BASE_GROUPS=$(jq -n -c '["unit-core-sqlite", "unit-core-postgres", "unit-storage", "tutorial", "guide", "tiledbsoma", "biology", "faq", "storage", "cli", "permissions", "no-instance"]') ADDITIONAL_GROUPS=[] if [[ "${{ github.event_name }}" == "push" || "${{ github.event_name }}" == "repository_dispatch" ]]; then # Run everything on push and dispatch ADDITIONAL_GROUPS=$(jq -n -c '["curator", "integrations"]') else # Otherwise check which paths changed if [[ "${{ steps.changes.outputs.curator }}" == "true" ]]; then ADDITIONAL_GROUPS=$(jq -n -c --argjson groups "$ADDITIONAL_GROUPS" '$groups + ["curator"]') fi if [[ "${{ steps.changes.outputs.integrations }}" == "true" ]]; then ADDITIONAL_GROUPS=$(jq -n -c --argjson groups "$ADDITIONAL_GROUPS" '$groups + ["integrations"]') fi fi # Combine base groups with any additional groups MATRIX=$(jq -n -c --argjson base "$BASE_GROUPS" --argjson additional "$ADDITIONAL_GROUPS" '{group: ($base + $additional)}') # Output as single line for GitHub Actions echo "matrix=$(echo "$MATRIX" | jq -c .)" >> $GITHUB_OUTPUT # Pretty print for debugging echo "Generated matrix:" echo "$MATRIX" | jq . 
test: needs: pre-filter runs-on: ubuntu-latest strategy: fail-fast: false matrix: ${{fromJson(needs.pre-filter.outputs.matrix)}} timeout-minutes: 20 steps: - uses: actions/checkout@v6 with: submodules: recursive fetch-depth: 0 - uses: actions/checkout@v6 if: ${{ matrix.group == 'permissions' }} with: repository: laminlabs/laminhub token: ${{ secrets.GH_TOKEN_DEPLOY_LAMINAPP }} path: laminhub ref: main - uses: actions/setup-python@v6 with: python-version: | ${{ matrix.group == 'tiledbsoma' && '3.13' || matrix.group == 'permissions' && '3.14' || github.ref == 'refs/heads/release' && '3.11' || '3.14' }} - name: cache pre-commit uses: actions/cache@v4 with: path: ~/.cache/pre-commit key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }} - name: cache postgres if: ${{ matrix.group == 'faq' || matrix.group == 'unit-core-postgres' || matrix.group == 'unit-storage' || matrix.group == 'permissions'}} id: cache-postgres uses: actions/cache@v4 with: path: ~/postgres.tar key: cache-postgres-0 restore-keys: | cache-postgres- - name: cache postgres miss if: ${{ (matrix.group == 'faq' || matrix.group == 'unit-core-postgres' || matrix.group == 'unit-storage' || matrix.group == 'permissions') && steps.cache-postgres.outputs.cache-hit != 'true' }} run: docker pull postgres:latest && docker image save postgres:latest --output ~/postgres.tar - name: cache postgres use if: ${{ (matrix.group == 'faq' || matrix.group == 'unit-core-postgres' || matrix.group == 'unit-storage' || matrix.group == 'permissions') && steps.cache-postgres.outputs.cache-hit == 'true' }} run: docker image load --input ~/postgres.tar - run: pip install "laminci@git+https://github.com/laminlabs/laminci" - run: nox -s configure_coverage -- '${{needs.pre-filter.outputs.matrix}}' - name: install postgres if: ${{ matrix.group == 'faq' }} run: sudo apt-get install libpq-dev - name: install graphviz if: ${{ matrix.group == 'tutorial' || matrix.group == 'guide' || matrix.group == 'biology' || matrix.group == 'faq'}} run: sudo apt-get -y install graphviz # - run: nox -s lint # if: ${{ matrix.group == 'tutorial' }} - run: nox -s "install_ci(group='${{ matrix.group }}')" - uses: aws-actions/configure-aws-credentials@v4 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: us-east-1 - run: nox -s prepare if: ${{ !startsWith(matrix.group, 'unit-') && !startsWith(matrix.group, 'permissions') }} - run: nox -s "test(group='${{ matrix.group }}')" - name: upload coverage uses: actions/upload-artifact@v4 with: name: coverage--${{ matrix.group }} path: .coverage include-hidden-files: true - name: upload docs if: ${{ matrix.group == 'tutorial' || matrix.group == 'guide' || matrix.group == 'tiledbsoma' || matrix.group == 'biology' || matrix.group == 'faq' || matrix.group == 'storage' }} uses: actions/upload-artifact@v4 with: name: docs-${{ matrix.group }} path: ./docs/${{ matrix.group }} profile: runs-on: ubuntu-latest timeout-minutes: 10 env: LAMIN_API_KEY: ${{ secrets.LAMIN_API_KEY_TESTUSER1 }} steps: - uses: actions/checkout@v6 with: submodules: recursive fetch-depth: 0 - uses: actions/setup-python@v6 with: python-version: | ${{ github.ref == 'refs/heads/release' && '3.11' || '3.14' }} - run: pip install git+https://github.com/laminlabs/laminci - run: nox -s "install_ci(group='unit-core-sqlite')" - run: uv pip install --system git+https://github.com/laminlabs/laminprofiler - run: lamin login - run: laminprofiler check 
tests/profiling/import_lamindb_and_connect.py --threshold 3.5 - run: lamin connect laminlabs/lamindata - run: laminprofiler check tests/profiling/import_lamindb.py --threshold 1.5 - run: laminprofiler check tests/profiling/import_lamindb_core_storage.py --threshold 1.5 docs: needs: test runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 with: submodules: recursive fetch-depth: 0 - name: checkout lndocs uses: actions/checkout@v6 with: repository: laminlabs/lndocs ssh-key: ${{ secrets.READ_LNDOCS }} path: lndocs ref: main - uses: aws-actions/configure-aws-credentials@v4 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: us-east-1 - uses: actions/setup-python@v6 with: python-version: "3.12" - run: pip install "laminci@git+https://x-access-token:${{ secrets.LAMIN_BUILD_DOCS }}@github.com/laminlabs/laminci" - run: nox -s "install_ci(group='docs')" - uses: actions/download-artifact@v4 - run: nox -s clidocs - run: nox -s prepare - run: nox -s docs - run: rm -r ./_build/html/.doctrees # do not want to deploy with cloudflare - uses: cloudflare/wrangler-action@v3 id: cloudflare with: apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} accountId: 472bdad691b4483dea759eadb37110bd command: pages deploy "_build/html" --project-name=lamindb gitHubToken: ${{ secrets.GITHUB_TOKEN }} - uses: edumserrano/find-create-or-update-comment@v2 if: github.event_name == 'pull_request' with: issue-number: ${{ github.event.pull_request.number }} body-includes: "Deployment URL" comment-author: "github-actions[bot]" body: | Deployment URL: ${{ steps.cloudflare.outputs.deployment-url }} edit-mode: replace - uses: peter-evans/repository-dispatch@v2 if: ${{ github.event_name == 'push' }} with: token: ${{ secrets.LAMIN_BUILD_DOCS }} repository: "laminlabs/lamin-docs" event-type: build coverage: needs: test runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: actions/setup-python@v6 with: python-version: "3.14" - run: | python -m pip install -U uv uv pip install --system coverage[toml] uv pip install --system --no-deps . 
- uses: actions/download-artifact@v4 - name: run coverage run: | coverage combine coverage--*/.coverage* coverage report --fail-under=0 coverage xml - uses: codecov/codecov-action@v2 with: token: ${{ secrets.CODECOV_TOKEN }} dispatch: if: ${{ github.event_name == 'push' }} runs-on: ubuntu-latest steps: - uses: peter-evans/repository-dispatch@v2 with: token: ${{ secrets.LAMIN_BUILD_DOCS }} repository: "laminlabs/lamindb-dispatch" event-type: build ================================================ FILE: .github/workflows/doc-changes.yml ================================================ name: doc-changes on: pull_request_target: branches: - main - release types: - closed jobs: doc-changes: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: "3.11" - run: pip install "laminci[doc-changes]@git+https://x-access-token:${{ secrets.LAMIN_BUILD_DOCS }}@github.com/laminlabs/laminci" - run: laminci doc-changes env: repo_token: ${{ secrets.GITHUB_TOKEN }} docs_token: ${{ secrets.LAMIN_BUILD_DOCS }} changelog_file: lamin-docs/docs/changelog/soon/lamindb.md ================================================ FILE: .gitignore ================================================ __MACOSX/ # LaminDB README_stripped.md docs/scripts/test_artifact_parquet.py README.ipynb docs/sample.fasta docs/faq/sample.fasta docs/faq/test-acid/ docs/scripts/define_mini_immuno_features_labels.py docs/scripts/define_mini_immuno_schema_flexible.py docs/scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py docs/scripts/define_valid_features.py docs/scripts/save_mini_immuno_datasets.py profile_output* docs/cli.md .coveragerc *.db *.lndb *.jpg *.zarr/ docsbuild/ docs/lamin.md docs/guide/data-validation.ipynb docs/guide/bionty.ipynb docs/guide/lnschema-core.ipynb docs/paradisi05_laminopathic_nuclei.jpg bionty_docs/ lamindb_docs/ _build mydata/ lamin-intro/ lamin-tutorial/ mytest/ rds/ mydb/ docs/test-registries/ docs/test-annotate-flexible/ docs/lamindb.* lamin_sphinx docs/conf.py lamindb/setup/.env _secrets.py _configuration.py lamin.db docs/generated/* _docs_tmp* docs/guide/Laminopathic_nuclei.jpg docs/guide/paradisi05_laminopathic_nuclei.jpg nocodb docs/guide/SRR4238351_subsamp.fastq.gz docs/faq/paradisi05_laminopathic_nuclei.jpg docs/faq/tostore/ docs/faq/mydata_postgres/ docs/guide/myobjects/ docs/faq/test-run-inputs/ docs/intro/paradisi05_laminopathic_nuclei.jpg docs/guide/figures/ docs/test-annotate/ docs/test-track/ suo22/ docs/biology/test-flow/ docs/biology/test-scrna/ docs/biology/test-registries/ docs/biology/test-multimodal/ default_storage default_storage_unit_core default_storage_unit_storage test.ipynb test2.ipynb run-tests test-django-validation/ curate.tiledbsoma small_dataset.tiledbsoma nonregistered_storage registered_storage tests/core/notebooks/no-uid-renamed.ipynb # General .DS_Store # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # ruff .ruff_cache # Pyre type checker .pyre/ # data files data/ _build *.csv *.fcs *.zip *.feather *.h5ad *.h5mu *.parquet *.bam *.fastq.gz *.pt # Pycharm .idea # VSCode .vscode # CELLxGENE !lamindb/examples/cellxgene/cellxgene_schema_versions.csv # ml lightning_logs mlruns download_mnist checkpoints test_lightning ================================================ FILE: .gitmodules ================================================ [submodule "sub/lamindb-setup"] path = sub/lamindb-setup url = https://github.com/laminlabs/lamindb-setup [submodule "sub/lamin-cli"] path = sub/lamin-cli url = https://github.com/laminlabs/lamin-cli [submodule "sub/bionty"] path = sub/bionty url = https://github.com/laminlabs/bionty [submodule "sub/pertdb"] path = sub/pertdb url = https://github.com/laminlabs/pertdb [submodule "sub/cellxgene-lamin"] path = sub/cellxgene-lamin url = https://github.com/laminlabs/cellxgene-lamin.git ================================================ FILE: .pre-commit-config.yaml ================================================ fail_fast: false default_language_version: python: python3 default_stages: - pre-commit - pre-push minimum_pre_commit_version: 2.16.0 repos: - repo: https://github.com/rbubley/mirrors-prettier rev: v3.5.1 hooks: - id: prettier exclude: | (?x)( docs/changelog.md|.github/ISSUE_TEMPLATE/config.yml|tests/core/notebooks/basic-r-notebook.Rmd.cleaned.html|README.md ) - repo: https://github.com/kynan/nbstripout rev: 0.8.1 hooks: - id: nbstripout exclude: | (?x)( docs/examples/| docs/notes/ ) - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.9.10 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix, --unsafe-fixes] - id: ruff-format - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: - id: detect-private-key - id: check-ast - id: end-of-file-fixer exclude: | (?x)( .github/workflows/latest-changes.jinja2 ) - id: mixed-line-ending args: [--fix=lf] - id: trailing-whitespace exclude: | (?x)( tests/core/notebooks/basic-r-notebook.Rmd.cleaned.html ) - id: check-case-conflict - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.14.1 hooks: - id: mypy args: [ --no-strict-optional, --ignore-missing-imports, --disable-error-code=annotation-unchecked, --disable-error-code=type-arg, 
--namespace-packages, --explicit-package-bases, ] additional_dependencies: ["types-requests", "types-attrs", "types-PyYAML"] exclude: | (?x)( test_notebooks.py| script-to-test-versioning.py| tests/storage/conftest.py| tests/curators/conftest.py| tests/permissions/conftest.py| tests/writelog/conftest.py| tests/writelog_sqlite/conftest.py| tests/curators/test_curators_examples.py| tests/core/conftest.py| docs/scripts/ ) ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing Contributions are generally welcome. Please make an issue to discuss proposals. ## Installation ### PyPI For installation from PyPI, see [docs.lamin.ai/setup](https://docs.lamin.ai/setup). ### Github For installation from GitHub, call: ```bash git clone --recursive https://github.com/laminlabs/lamindb pip install laminci python -m venv .venv source .venv/bin/activate nox -s install ``` This will install a few dependencies from the git submodules linked [here](https://github.com/laminlabs/lamindb/tree/main/sub), as well as packages like `pytest` and `pre-commit` that you'll need when developing. lamindb depends on several other packages that may require modifications for pull requests to successfully pass the continuous integration build. We suggest the following workflow if commits to any of the submodules are essential for the current modifications in lamindb: 1. Change directory into the submodule that you want to modify: `cd sub/SUBMODULE`. 2. Switch to a new feature branch: `git switch -c feature/NEWFEATURE`. 3. Make a pull request with your changes to the `SUBMODULE` and ensure that the CI passes. 4. In the repository root of lamindb, create a new commit and push: ```bash cd .. git add -u git commit -m "Upgraded SUBMODULE" git push ``` Any pull request of yours should now also have the changes of the submodule included allowing you to test that changes in the submodule and lamindb are compatible. ## Running and writing tests This package uses the [pytest][] for automated testing. Please add a test for every function added to the package. Running tests requires the [Docker daemon][] up, then run at the root of the repository: ```bash pytest --ignore=tests/storage --ignore=tests/permission ``` in the root of the repository. We exclude specific directories in local `pytest` runs because they directly access external resources such as AWS, which require specific access keys. Continuous integration will automatically run **all** tests on pull requests. ## Code-style This project uses [pre-commit][] to enforce consistent code-styles. On every commit, pre-commit checks will either automatically fix issues with the code, or raise an error message. To enable pre-commit locally, simply run ```bash pre-commit install ``` in the root of the repository. Pre-commit will automatically download all dependencies when it is run for the first time. We further use [gitmoji][] to add emoticons to commits. These allow us to more easily categorize them allowing for faster visual filtering. It can be installed by running: ```bash npm i -g gitmoji-cli ``` and enabled for the repository via: ```bash gitmoji -i ``` If you don't have `sudo` in your working environment, follow [these instructions](https://github.com/sindresorhus/guides/blob/main/npm-global-without-sudo.md). ## Documentation We build our documentation with an internal tool called `lndocs`. 
We have not made it public yet and therefore external contributors need to rely on the Github Actions `docs` job to build the documentation. If the `docs` job succeeds, a preview URL will be posted automatically as a comment to your pull request. ## Releases Currently only lamin employees have release rights. Release publishing is managed via `laminci release --pypi`. For `lamindb`, the release flow now publishes two distributions in sequence: - `lamindb-core` (contains the `lamindb/` namespace package) - `lamindb` (meta-package that depends on `lamindb-core`) Before first production publish of a version, run a TestPyPI dry run by building both wheels from `pyproject.toml` and `pyproject.full.toml`, then uploading with `twine` to TestPyPI for verification. [Docker daemon]: https://docs.docker.com/engine/install/ [gitmoji]: https://gitmoji.dev/ [pre-commit]: https://pre-commit.com/ [pytest]: https://docs.pytest.org/ ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) 
The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ [![docs](https://img.shields.io/badge/docs-yellow)](https://docs.lamin.ai) [![llms.txt](https://img.shields.io/badge/llms.txt-orange)](https://docs.lamin.ai/llms.txt) [![codecov](https://codecov.io/gh/laminlabs/lamindb/branch/main/graph/badge.svg?token=VKMRJ7OWR3)](https://codecov.io/gh/laminlabs/lamindb) [![pypi](https://img.shields.io/pypi/v/lamindb?color=blue&label=PyPI)](https://pypi.org/project/lamindb) [![cran](https://www.r-pkg.org/badges/version/laminr?color=green)](https://cran.r-project.org/package=laminr) [![stars](https://img.shields.io/github/stars/laminlabs/lamindb?style=flat&logo=GitHub&label=&color=gray)](https://github.com/laminlabs/lamindb) [![downloads](https://static.pepy.tech/personalized-badge/lamindb?period=total&units=INTERNATIONAL_SYSTEM&left_color=GRAY&right_color=GRAY&left_text=%E2%AC%87%EF%B8%8F)](https://pepy.tech/project/lamindb) # LaminDB - Open-source data framework for biology LaminDB allows you to query, trace, and validate datasets and models at scale. You get context & memory through a lineage-native lakehouse that supports bio-formats, registries & ontologies while feeling as simple as a file system. Agent? [llms.txt](https://docs.lamin.ai/llms.txt)
Why? (1) Reproducing, tracing & understanding how datasets, models & results are created is critical to quality R&D. Without context, humans & agents make mistakes and cannot close feedback loops across data generation & analysis. Without memory, compute & intelligence are wasted on fragmented, non-compounding tasks — LLM context windows are small. (2) Training & fine-tuning models with thousands of datasets — across LIMS, ELNs, orthogonal assays — is now a primary path to scaling R&D. But without queryable & validated data or with data locked in organizational & infrastructure silos, it leads to garbage in, garbage out or is quite simply impossible. Imagine building software without git or pull requests: an agent's actions would be impossible to verify. While code has git and tables have dbt/warehouses, biological data has lacked a framework for managing its unique complexity. LaminDB fills the gap. It is a lineage-native lakehouse that understands bio-registries and formats (`AnnData`, `.zarr`, …) based on the established open data stack: Postgres/SQLite for metadata and cross-platform storage for datasets. By offering queries, tracing & validation in a single API, LaminDB provides the context & memory to turn messy, agentic biological R&D into a scalable process.
How?

- **lineage** → track inputs & outputs of notebooks, scripts, functions & pipelines with a single line of code
- **lakehouse** → manage, monitor & validate schemas for standard and bio formats; query across many datasets
- **FAIR datasets** → validate & annotate `DataFrame`, `AnnData`, `SpatialData`, `parquet`, `zarr`, …
- **LIMS & ELN** → programmatic experimental design with bio-registries, ontologies & markdown notes
- **unified access** → storage locations (local, S3, GCP, …), SQL databases (Postgres, SQLite) & ontologies
- **reproducible** → auto-track source code & compute environments with data & code versioning
- **change management** → branching & merging similar to git, plan management for agents
- **zero lock-in** → runs anywhere on open standards (Postgres, SQLite, `parquet`, `zarr`, etc.)
- **scalable** → you hit storage & database directly through your `pydata` or R stack, no REST API involved
- **simple** → just `pip install` from PyPI or `install.packages('laminr')` from CRAN
- **distributed** → zero-copy & lineage-aware data sharing across infrastructure (databases & storage locations)
- **integrations** → [git](https://docs.lamin.ai/track#sync-code-with-git), [nextflow](https://docs.lamin.ai/nextflow), [vitessce](https://docs.lamin.ai/vitessce), [redun](https://docs.lamin.ai/redun), and [more](https://docs.lamin.ai/integrations)
- **extensible** → create custom plug-ins based on the Django ORM, the basis for LaminDB's registries

GUI, permissions, audit logs? [LaminHub](https://lamin.ai) is a collaboration hub built on LaminDB similar to how GitHub is built on git.
Who? Scientists and engineers at leading research institutions and biotech companies, including:

- **Industry** → Pfizer, Altos Labs, Ensocell Therapeutics, ...
- **Academia & Research** → scverse, DZNE (National Research Center for Neuro-Degenerative Diseases), Helmholtz Munich (National Research Center for Environmental Health), ...
- **Research Hospitals** → Global Immunological Swarm Learning Network: Harvard, MIT, Stanford, ETH Zürich, Charité, U Bonn, Mount Sinai, ...

From personal research projects to pharma-scale deployments managing petabytes of data across:

| entities | OOMs |
| --- | --- |
| observations & datasets | 10¹² & 10⁶ |
| runs & transforms | 10⁹ & 10⁵ |
| proteins & genes | 10⁹ & 10⁶ |
| biosamples & species | 10⁵ & 10² |
| ... | ... |
## Quickstart To install the Python package with recommended dependencies, use: ```shell pip install lamindb ```
Install with minimal dependencies. The `lamindb` package adds the data-science dependencies that come with the `[full]` extra, see [here](https://github.com/laminlabs/lamindb/blob/2cc91adcf6077c5af69c1a098699085bb0844083/pyproject.toml#L30-L49). If you want a maximally lightweight install of the `lamindb` namespace, use:

```shell
pip install lamindb-core
```

This suffices for basic functionality, but you will get an `ImportError` if you, for example, try to validate a `DataFrame`, because that requires `pandera`.
### Query databases & load artifacts You can browse public databases at [lamin.ai/explore](https://lamin.ai/explore). To query [laminlabs/cellxgene](https://lamin.ai/laminlabs/cellxgene), run: ```python import lamindb as ln db = ln.DB("laminlabs/cellxgene") # a database object for queries df = db.Artifact.to_dataframe() # a dataframe listing datasets & models ``` To get a [specific dataset](https://lamin.ai/laminlabs/cellxgene/artifact/BnMwC3KZz0BuKftR), run: ```python artifact = db.Artifact.get("BnMwC3KZz0BuKftR") # a metadata object for a dataset artifact.describe() # describe the context of the dataset ```
Access the content of the dataset via: ```python local_path = artifact.cache() # return a local path from a cache adata = artifact.load() # load object into memory accessor = artifact.open() # return a streaming accessor ``` You can query by biological entities like `Disease` through plug-in `bionty`: ```python alzheimers = db.bionty.Disease.get(name="Alzheimer disease") df = db.Artifact.filter(diseases=alzheimers).to_dataframe() ``` ### Configure your database You can create a LaminDB instance at [lamin.ai](https://lamin.ai) and invite collaborators. To connect to an existing instance, run: ```shell # log into LaminHub lamin login # then either lamin connect account/name # connect globally in your environment # or lamin connect --here account/name # connect in your current development directory ``` If you prefer to init a new instance instead (no login required), run: ```shell lamin init --storage ./quickstart-data --modules bionty ``` For more configuration, read: [docs.lamin.ai/setup](https://docs.lamin.ai/setup). On the terminal and in a Python session, LaminDB will now auto-connect. ### Save files & folders as artifacts To save a file or folder via the API: ```python import lamindb as ln # → connected lamindb: account/instance open("sample.fasta", "w").write(">seq1\nACGT\n") # create dataset ln.Artifact("sample.fasta", key="sample.fasta").save() # save dataset ``` To save a file or folder via the CLI, run: ```shell lamin save sample.fasta --key sample.fasta ``` To load an artifact via the CLI into a local cache, run: ```shell lamin load --key sample.fasta ``` Read more about the CLI: [docs.lamin.ai/cli](https://docs.lamin.ai/cli). ### Lineage: scripts & notebooks To create a dataset while tracking source code, inputs, outputs, logs, and environment: ```python import lamindb as ln # → connected lamindb: account/instance ln.track() # track code execution open("sample.fasta", "w").write(">seq1\nACGT\n") # create dataset ln.Artifact("sample.fasta", key="sample.fasta").save() # save dataset ln.finish() # mark run as finished ``` Running this snippet as a script (`python create-fasta.py`) produces the following data lineage: ```python artifact = ln.Artifact.get(key="sample.fasta") # get artifact by key artifact.describe() # context of the artifact artifact.view_lineage() # fine-grained lineage ``` Watch a mini video: [youtu.be/jwnHu1PbA9Q](https://youtu.be/jwnHu1PbA9Q)
Access run & transform. ```python run = artifact.run # get the run object transform = artifact.transform # get the transform object run.describe() # context of the run ``` ```python transform.describe() # context of the transform ```
Track a project or an agent plan. Pass a project and/or a plan (a saved markdown artifact) to `ln.track()`, for example:

```python
ln.track(project="My project", plan="./plans/curate-dataset-x.md")
```

Note that you have to create the project or save the agent plan if they don't exist yet:

```shell
# create a project with the CLI
lamin create project "My project"
# save an agent plan with the CLI
lamin save /path/to/.cursor/plans/curate-dataset-x.plan.md
lamin save /path/to/.claude/plans/curate-dataset-x.md
```

Or in Python:

```python
ln.Project(name="My project").save()  # create a project in Python
```
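Runs that consume `sample.fasta` downstream get connected to it in the lineage graph as well. Below is a minimal sketch, assuming run-input tracking is enabled (see the `track-run-inputs` FAQ in this repo); the `derived.fasta` output and the sequence edit are made up for illustration:

```python
import lamindb as ln

ln.track()  # start a tracked run
input_artifact = ln.Artifact.get(key="sample.fasta")
seq = input_artifact.cache().read_text()  # caching/loading registers the artifact as a run input
open("derived.fasta", "w").write(seq.replace("ACGT", "TGCA"))  # derive a new dataset
ln.Artifact("derived.fasta", key="derived.fasta").save()  # saved as an output of the same run
ln.finish()
```

Calling `view_lineage()` on the derived artifact should then show `sample.fasta` as an input of the run that produced it.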
### Lineage: functions & workflows You can achieve the same traceability for functions & workflows: ```python import lamindb as ln @ln.flow() def create_fasta(fasta_file: str = "sample.fasta"): open(fasta_file, "w").write(">seq1\nACGT\n") # create dataset ln.Artifact(fasta_file, key=fasta_file).save() # save dataset if __name__ == "__main__": create_fasta() ``` Beyond what you get for scripts & notebooks, this automatically tracks function & CLI params and integrates well with established Python workflow managers: [docs.lamin.ai/track](https://docs.lamin.ai/track). To integrate advanced bioinformatics pipeline managers like Nextflow, see [docs.lamin.ai/pipelines](https://docs.lamin.ai/pipelines).
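For example, a flow's parameters end up recorded with its run. A minimal sketch, assuming default `ln.flow()` behavior; the `n_repeats` parameter is hypothetical and only serves to illustrate parameter tracking:

```python
import lamindb as ln

@ln.flow()
def create_fasta(fasta_file: str = "sample.fasta", n_repeats: int = 2):
    # function parameters such as `n_repeats` are captured alongside the run
    open(fasta_file, "w").write(">seq1\n" + "ACGT" * n_repeats + "\n")
    ln.Artifact(fasta_file, key=fasta_file).save()

if __name__ == "__main__":
    create_fasta(n_repeats=3)  # the passed value is recorded with this run
```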
A richer example. Here is an automatically generated reconstruction of the project of [Schmidt _et al._ (Science, 2022)](https://pubmed.ncbi.nlm.nih.gov/35113687/), in which a phenotypic CRISPRa screening result is integrated with scRNA-seq data (the lineage graph and screen-result figures are omitted here). You can explore it [here](https://lamin.ai/laminlabs/lamindata/artifact/W1AiST5wLrbNEyVq) on LaminHub or [here](https://github.com/laminlabs/schmidt22) on GitHub.
### Labeling & queries by fields You can label an artifact by running: ```python my_label = ln.ULabel(name="My label").save() # a universal label project = ln.Project(name="My project").save() # a project label artifact.ulabels.add(my_label) artifact.projects.add(project) ``` Query for it: ```python ln.Artifact.filter(ulabels=my_label, projects=project).to_dataframe() ``` You can also query by the metadata that lamindb automatically collects: ```python ln.Artifact.filter(run=run).to_dataframe() # by creating run ln.Artifact.filter(transform=transform).to_dataframe() # by creating transform ln.Artifact.filter(size__gt=1e6).to_dataframe() # size greater than 1MB ``` If you want to include more information into the resulting dataframe, pass `include`. ```python ln.Artifact.to_dataframe(include=["created_by__name", "storage__root"]) # include fields from related registries ``` Note: The query syntax for `DB` objects and for your default database is the same. ### The core data model Here is an overview that illustrates how `Artifact` links to all other registries: ### Queries by features You can annotate datasets and samples with features. Let's define some: ```python from datetime import date ln.Feature(name="gc_content", dtype=float).save() ln.Feature(name="experiment_note", dtype=str).save() ln.Feature(name="experiment_date", dtype=date, coerce=True).save() # accept date strings ``` During annotation, feature names and data types are validated against these definitions. ```python artifact.features.set_values({ "gc_content": 0.55, "experiment_note": "Looks great", "experiment_date": "2025-10-24", }) ``` Query for it: ```python ln.Artifact.filter(experiment_date="2025-10-24").to_dataframe() # query all artifacts annotated with `experiment_date` ``` If you want to include the feature values into the dataframe, pass `include`. ```python ln.Artifact.to_dataframe(include="features") # include the feature annotations ``` ### Lake ♾️ LIMS ♾️ Sheets You can create records for the entities underlying your experiments: samples, perturbations, instruments, etc., for example: ```python ln.Record(name="Sample 1", features={"gc_content": 0.5}).save() ``` You can create relationships of entities: ```python # create a flexible record type to track experiments experiment_type = ln.Record(name="Experiment", is_type=True).save() # create a record of type `Experiment` for your first experiment ln.Record(name="Experiment 1", type=experiment_type).save() # create a feature to link experiments in records, dataframes, etc. ln.Feature(name="experiment", dtype=experiment_type).save() # create a sample record that links the sample to `Experiment 1` via the `experiment` feature ln.Record(name="Sample 2", features={"gc_content": 0.5, "experiment": "Experiment 1"}).save() ``` You can convert any record type to dataframe/sheet: ```python experiment_type.to_dataframe() ```
You can edit records like Excel sheets on LaminHub.
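Because records carry feature values, they can be queried much like artifacts. A hedged sketch, assuming the feature-based filter syntax shown for `Artifact` above carries over to `Record`:

```python
# list all sample records linked to "Experiment 1" via the `experiment` feature
ln.Record.filter(experiment="Experiment 1").to_dataframe()
```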
### Data versioning If you change source code or datasets, LaminDB manages versioning for you. Assume you run a new version of our `create-fasta.py` script to create a new version of `sample.fasta`. ```python import lamindb as ln ln.track() open("sample.fasta", "w").write(">seq1\nTGCA\n") # a new sequence ln.Artifact("sample.fasta", key="sample.fasta", features={"experiment": "Experiment 1"}).save() # annotate with the new experiment ln.finish() ``` If you now query by `key`, you'll get the latest version of this artifact: ```python artifact = ln.Artifact.get(key="sample.fasta") # get artifact by key artifact.versions.to_dataframe() # see all versions of that artifact ``` ### Change management To create a contribution branch and switch to it, run: ```shell lamin switch -c my_branch ``` To merge a contribution branch into `main`, run: ```shell lamin switch main # switch to the main branch lamin merge my_branch # merge contribution branch into main ``` Read more: [docs.lamin.ai/lamindb.branch](https://docs.lamin.ai/lamindb.branch). ### Data sharing To share data in a lineage-aware way, sync objects from a source database to your default database: ```python db = ln.DB("laminlabs/lamindata") artifact = db.Artifact.get(key="example_datasets/mini_immuno/dataset1.h5ad") artifact.save() ``` This is zero-copy for the artifact's data in storage. Read more: [docs.lamin.ai/sync](https://docs.lamin.ai/sync). ### Lakehouse ♾️ feature store Here is how you ingest a `DataFrame`: ```python import pandas as pd df = pd.DataFrame({ "sequence_str": ["ACGT", "TGCA"], "gc_content": [0.55, 0.54], "experiment_note": ["Looks great", "Ok"], "experiment_date": [date(2025, 10, 24), date(2025, 10, 25)], }) ln.Artifact.from_dataframe(df, key="my_datasets/sequences.parquet").save() # no validation ``` To validate & annotate the content of the dataframe, use the built-in schema `valid_features`: ```python ln.Feature(name="sequence_str", dtype=str).save() # define a remaining feature artifact = ln.Artifact.from_dataframe( df, key="my_datasets/sequences.parquet", schema="valid_features" # validate columns against features ).save() artifact.describe() ``` Watch a mini video: [youtu.be/Ji6E7hTnReQ](https://youtu.be/Ji6E7hTnReQ) You can filter for datasets by schema and then launch distributed queries and batch loading. ### Lakehouse beyond tables To validate an `AnnData` with built-in schema `ensembl_gene_ids_and_valid_features_in_obs`, call: ```python import anndata as ad import numpy as np import pandas as pd adata = ad.AnnData( X=np.ones((21, 10)), obs=pd.DataFrame({'cell_type_by_model': ['T cell', 'B cell', 'NK cell'] * 7}), var=pd.DataFrame(index=[f'ENSG{i:011d}' for i in range(10)]) ) artifact = ln.Artifact.from_anndata( adata, key="my_datasets/scrna.h5ad", schema="ensembl_gene_ids_and_valid_features_in_obs" ).save() artifact.describe() ``` To validate a `SpatialData` or any other array-like dataset, you need to construct a `Schema`. You can do this by composing simple `pandera`-style schemas: [docs.lamin.ai/curate](https://docs.lamin.ai/curate). ### Ontologies Plugin `bionty` gives you >20 public ontologies as `SQLRecord` registries. This was used to validate the `ENSG` ids in the `adata` just before. ```python import bionty as bt bt.CellType.import_source() # import the default ontology bt.CellType.to_dataframe() # your extensible cell type ontology in a simple registry ``` You can then create objects, e.g. 
for labeling, analogous to `ULabel`, `Project`, or `Record`: ```python t_cell = bt.CellType.get(name="T cell") artifact.cell_types.add(t_cell) ``` Read more: [docs.lamin.ai/manage-ontologies](https://docs.lamin.ai/manage-ontologies). Watch a mini video: [youtu.be/3vpWjHj3Kw8](https://youtu.be/3vpWjHj3Kw8) ### Save unstructured notes When in your development directory, you can save markdown files as records: ```shell lamin save / ``` ================================================ FILE: docs/api.md ================================================ # API Reference ```{toctree} :maxdepth: 1 :caption: CLI & lamindb :hidden: cli lamindb ``` ```{toctree} :maxdepth: 1 :caption: Modules :hidden: bionty pertdb ``` ================================================ FILE: docs/arrays.md ================================================ --- execute_via: python --- # Stream datasets from storage This guide walks through streaming datasets from disk or cloud storage. ```python # replace with your username and S3 bucket !lamin login testuser1 !lamin init --storage s3://lamindb-ci/test-arrays ``` Import lamindb and track this notebook. ```python import lamindb as ln import numpy as np ln.track() db = ln.DB("laminlabs/lamindata") # we'll pull dataset from there ``` ## DataFrame ### Streaming from a single artifact A dataframe stored as sharded `parquet`. ```python artifact = db.Artifact.get(key="sharded_parquet") ``` ```python artifact.path.view_tree() ``` ```python dataset = artifact.open() ``` This returns a [pyarrow dataset](https://arrow.apache.org/docs/python/dataset.html). ```python dataset ``` ```python dataset.head(5).to_pandas() ``` ### Streaming from a set of artifacts You can open several parquet files as a single dataset by calling `.open()` on the result of a query: ```python dataset = db.Artifact.filter( key__startswith="example_datasets/small", suffix=".parquet", is_latest=True ).open() # open an ArtifactSet for streaming dataset ``` The same is possible for the artifacts in a collection: ```python collection = db.Collection.get(key="sharded_parquet_collection") dataset = collection.open() dataset ``` Once you have a storage-backed dataset, you can query it like this: ```python dataset.to_table().to_pandas() ``` By default `Artifact.open()` and `Collection.open()` use `pyarrow` to lazily open dataframes. `polars` can be also used by passing `engine="polars"`. Note also that `.open(engine="polars")` returns a context manager with [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html). 
```python with collection.open(engine="polars", use_fsspec=True) as lazy_df: display(lazy_df.collect().to_pandas()) ``` ## AnnData We'll need some test data: ```python ln.Artifact("s3://lamindb-ci/test-arrays/pbmc68k.h5ad").save() ln.Artifact("s3://lamindb-ci/test-arrays/testfile.hdf5").save() ``` An `h5ad` artifact stored on s3: ```python artifact = ln.Artifact.get(key="pbmc68k.h5ad") ``` ```python artifact.path ``` ```python adata = artifact.open() ``` This object is an `AnnDataAccessor` object, an `AnnData` object backed in the cloud: ```python adata ``` Without subsetting, the `AnnDataAccessor` object references underlying lazy `h5` or `zarr` arrays: ```python adata.X ``` You can subset it like a normal `AnnData` object: ```python obs_idx = adata.obs.cell_type.isin(["Dendritic cells", "CD14+ Monocytes"]) & ( adata.obs.percent_mito <= 0.05 ) adata_subset = adata[obs_idx] adata_subset ``` Subsets load arrays into memory upon direct access: ```python adata_subset.X ``` To load the entire subset into memory as an actual `AnnData` object, use `to_memory()`: ```python adata_subset.to_memory() ``` It is also possible to add columns to `.obs` and `.var` of cloud AnnData objects without downloading them. First, create a new `AnnData` `zarr` artifact: ```python adata_subset.to_memory().write_zarr("adata_subset.zarr") artifact = ln.Artifact( "adata_subset.zarr", description="test add column to adata" ).save() ``` This is how you add a column: ```python with artifact.open(mode="r+") as adata_accessor: adata_accessor.add_column(where="obs", col_name="ones", col=np.ones(adata_accessor.shape[0])) display(adata_accessor) ``` The version of the artifact is updated after the modification. ```python artifact ``` ```python artifact.delete(permanent=True) ``` ## SpatialData It is also possible to access `AnnData` objects inside `SpatialData` `tables`: ```python artifact = ln.Artifact.connect("laminlabs/lamindata").get( key="visium_aligned_guide_min.zarr" ) access = artifact.open() ``` ```python access ``` ```python access.tables ``` This gives you the same `AnnDataAccessor` object as for a normal `AnnData`. ```python table = access.tables["table"] table ``` You can subset it and read into memory as an actual `AnnData`: ```python table_subset = table[table.obs["clone"] == "diploid"] table_subset ``` ```python adata = table_subset.to_memory() ``` ## Generic HDF5 Let us query a generic HDF5 artifact: ```python artifact = ln.Artifact.get(key="testfile.hdf5") ``` And get a backed accessor: ```python backed = artifact.open() ``` The returned object contains the `.connection` and `h5py.File` or `zarr.Group` in `.storage` ```python backed ``` ```python backed.storage ``` ```python # clean up test instance ln.setup.delete("test-arrays", force=True) ``` ================================================ FILE: docs/bionty.md ================================================ # `bionty` ```{eval-rst} .. automodule:: bionty ``` ================================================ FILE: docs/changelog.md ================================================ # Changelog Actual content in lamin-docs. ================================================ FILE: docs/curate.md ================================================ --- execute_via: python --- # Validate & standardize datasets Data curation with LaminDB ensures your datasets are **validated** and **queryable** through **annotation**. ```{raw} html ``` Curating a dataset with LaminDB means three things: - **Validate** that the dataset matches a desired schema. 
- **Standardize** the dataset (e.g., by fixing typos, mapping synonyms) or update registries if validation fails. - **Annotate** the dataset by linking it against metadata entities so that it becomes queryable. In this guide we'll curate common data structures. Here is a [guide](/faq/curate-any) for the underlying low-level API. Note: If you know either `pydantic` or `pandera`, here is an [FAQ](/faq/pydantic-pandera) that compares LaminDB with both of these tools. ```python # pip install lamindb !lamin init --storage ./test-curate --modules bionty ``` ```python import lamindb as ln ln.track() ``` ## Schema design patterns A {class}`~lamindb.Schema` in LaminDB is a specification that defines the expected structure, data types, and validation rules for a dataset. It is similar to `pydantic.Model` for dictionaries, and `pandera.Schema`, and `pyarrow.lib.Schema` for tables, but supporting more complicated data structures. Schemas ensure data consistency by defining: - What {class}`~lamindb.Feature`s (dimensions) exist in your dataset - What data types those features should have - What values are valid for categorical features - Which {class}`~lamindb.Feature`s are required vs optional An exemplary schema: ```python schema = ln.Schema( name="experiment_schema", # human-readable name features=[ # required features ln.Feature(name="cell_type", dtype=bt.CellType), ln.Feature(name="treatment", dtype=str), ], otype="DataFrame" # object type (DataFrame, AnnData, etc.) ) ``` For composite data structures using slots: ```{dropdown} What are slots? For composite data structures, you need to specify which component contains which schema, for example, to validate both cell metadata in `.obs` and gene metadata in `.var` within the same schema. Each slot is a key like `"obs"` for AnnData observations,`"rna:var"` for MuData modalities, or `"attrs:nested:key"` for SpatialData annotations. ``` ```python # AnnData with multiple "slots" adata_schema = ln.Schema( otype="AnnData", slots={ "obs": cell_metadata_schema, # cell annotations "var.T": gene_id_schema # gene-derived features } ) ``` Before diving into curation, let's understand the different schema approaches and when to use each one. Think of schemas as rules that define what valid data should look like. ### Flexible schema Use when: You want to validate those columns whose names match feature names in your `Feature` registry. ```{eval-rst} .. literalinclude:: scripts/define_valid_features.py :language: python ``` ### Minimal required schema Use when: You need certain columns but want flexibility for additional metadata. ```{eval-rst} .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py :language: python ``` ### Strict Schema Use when: You need complete control over data structure and values. ```python # Only allows specified columns schema = ln.Schema( features=[...], minimal_set=True, # whether all passed features are required maximal_set=False # whether additional features are allowed ) ``` ## DataFrame ### Step 1: Load and examine your data We'll be working with the mini immuno dataset: ```python df = ln.examples.datasets.mini_immuno.get_dataset1( with_cell_type_synonym=True, with_cell_type_typo=True ) df ``` ### Step 2: Set up your metadata registries Before creating a schema, ensure your registries have the right features and labels: ```{eval-rst} .. 
literalinclude:: scripts/define_mini_immuno_features_labels.py :language: python ``` ### Step 3: Create your schema ```python schema = ln.examples.datasets.mini_immuno.define_mini_immuno_schema_flexible() schema.describe() ``` ### Step 4: Initialize Curator and first validation If you expect the validation to pass, you can directly register an artifact by providing the schema: ```python artifact = ln.Artifact.from_dataframe(df, key="examples/my_curated_dataset.parquet", schema=schema).save() ``` The {meth}`~lamindb.curators.core.Curator.validate` method validates that your dataset adheres to the criteria defined by the `schema`. It identifies which values are already validated (exist in the registries) and which are potentially problematic (do not yet exist in our registries). ```python try: curator = ln.curators.DataFrameCurator(df, schema) curator.validate() except ln.errors.ValidationError as error: print(error) ``` ### Step 5: Fix validation issues ```python # check the non-validated terms curator.cat.non_validated ``` For `cell_type_by_expert`, we saw 2 terms are not validated. First, let's standardize synonym "B-cell" as suggested ```python curator.cat.standardize("cell_type_by_expert") ``` ```python # now we have only one non-validated cell type left curator.cat.non_validated ``` For "CD8-pos alpha-beta T cell", let's understand which cell type in the public ontology might be the actual match. ```python # to check the correct spelling of categories, pass `public=True` to get a lookup object from public ontologies # use `lookup = curator.cat.lookup()` to get a lookup object of existing records in your instance lookup = curator.cat.lookup(public=True) lookup ``` ```python # here is an example for the "cell_type" column cell_types = lookup["cell_type_by_expert"] cell_types.cd8_positive_alpha_beta_t_cell ``` ```python # fix the cell type name df["cell_type_by_expert"] = df["cell_type_by_expert"].cat.rename_categories( {"CD8-pos alpha-beta T cell": cell_types.cd8_positive_alpha_beta_t_cell.name} ) ``` For perturbation, we want to add the new values: "DMSO", "IFNG" ```python # this adds perturbations that were _not_ validated curator.cat.add_new_from("perturbation") ``` ```python ln.Feature.get(name="perturbation") ``` ```python # validate again curator.validate() ``` ### Step 6: Save your curated dataset ```python artifact = curator.save_artifact(key="examples/my_curated_dataset.parquet") ``` ```python artifact.describe() ``` ## Common fixes This section covers the most frequent curation issues and their solutions. Use this as a reference when validation fails. ### Feature validation issues **Issue**: "Column not in dataframe" ``` "column 'treatment' not in dataframe. Columns in dataframe: ['drug', 'timepoint', ...]" ``` **Solutions**: ```python # Solution 1: Rename columns to match schema df = df.rename(columns={ 'treatment': 'drug', 'time': 'timepoint', ... }) # Solution 2: Create missing columns df['treatment'] = 'unknown' # Add with default value (or define Feature.default_value) # Solution 3: Modify schema to match your data schema = ln.Schema( features=[ ln.Feature.get(name="drug"), # Use actual column name ln.Feature.get(name="timepoint"), ], ... 
) ``` ### Value validation issues **Issue**: "Terms not validated in feature 'perturbation'" ``` 2 terms not validated in feature 'cell_type': 'B-cell', 'CD8-pos alpha-beta T cell' 1 synonym found: "B-cell" → "B cell" → curate synonyms via: .standardize("cell_type") for remaining terms: → fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('cell_type') ``` **Solutions**: ```python # Solution 1: Use automatic standardization if given hint (handles synonyms)) curator.cat.standardize('cell_type') # Solution 2: Manual mapping for complex cases value_mapping = { 'T-cells': 'T cell', 'B-cells': 'B cell', } df['cell_type'] = df['cell_type'].map(value_mapping).fillna(df['cell_type']) # Solution 3: Use public ontology lookup for correct names lookup = curator.cat.lookup(public=True) cell_types = lookup["cell_type"] df['cell_type'] = df['cell_type'].cat.rename_categories({ 'CD8-pos T cell': cell_types.cd8_positive_alpha_beta_t_cell.name }) # Solution 4: Add new legitimate terms curator.cat.add_new_from("cell_type") ``` ### Data type issues **Issue**: "Expected categorical data, got object" ``` TypeError: Expected categorical data for cell_type, got object ``` **Solutions**: ```python # Solution 1: Convert to categorical df['cell_type'] = df['cell_type'].astype('category') # Solution 2: Use coercion in feature definition ln.Feature(name="cell_type", dtype=bt.CellType, coerce=True).save() ``` ### Organism-specific ontology issues **Issue**: "Terms not validated" for organism-specific ontologies like developmental stages ``` 2 terms not validated in feature 'developmental_stage_ontology_id': 'MmusDv:0000142', 'MmusDv:0000022' ``` **Solution**: Specify organism-specific source in feature definition using `cat_filters`: ```python # When defining the schema, specify the organism-specific source mouse_source = bt.Source.filter( entity="bionty.DevelopmentalStage", organism="mouse" ).one() schema = ln.Schema( features=[ ln.Feature( name="developmental_stage_ontology_id", dtype=bt.DevelopmentalStage.ontology_id, cat_filters={"source": mouse_source} # Specify organism-specific source ) ], ... ) ``` This pattern applies to any ontology where the same registry serves multiple organisms (e.g., `DevelopmentalStage`, `Phenotype`, ...). ## External data validation Since not all metadata is always stored within the dataset itself, it is also possible to validate external metadata. ```{eval-rst} .. literalinclude:: scripts/curate_dataframe_external_features.py :language: python :caption: curate_dataframe_external_features.py ``` ```python !python scripts/curate_dataframe_external_features.py ``` ## Union dtypes Some metadata columns might validate against several registries. ```{eval-rst} .. literalinclude:: scripts/curate_dataframe_union_features.py :language: python :caption: curate_dataframe_union_features.py ``` ```python !python scripts/curate_dataframe_union_features.py ``` ## AnnData `AnnData` like all other data structures that follow is a composite structure that stores different arrays in different `slots`. ### Allow a flexible schema We can also allow a flexible schema for an `AnnData` and only require that it's indexed with Ensembl gene IDs. ```{eval-rst} .. literalinclude:: scripts/curate_anndata_flexible.py :language: python :caption: curate_anndata_flexible.py ``` Let's run the script. 
```python !python scripts/curate_anndata_flexible.py ``` Under the hood, this uses the following built-in schema ({func}`~lamindb.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs`): ```{eval-rst} .. literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py :language: python ``` This schema transposes the `var` DataFrame during curation, so that one validates and annotates the columns of `var.T`, i.e., `[ENSG00000153563, ENSG00000010610, ENSG00000170458]`. If one doesn't transpose, one would annotate the columns of `var`, i.e., `[gene_symbol, gene_type]`. ```{eval-rst} .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/gLyfToATM7WUzkWW0001.png :width: 800px ``` ### Fix validation issues ```python adata = ln.examples.datasets.mini_immuno.get_dataset1( with_gene_typo=True, with_cell_type_typo=True, otype="AnnData" ) adata ``` ```python schema = ln.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs() schema.describe() ``` Check the slots of a schema: ```python schema.slots ``` ```python curator = ln.curators.AnnDataCurator(adata, schema) try: curator.validate() except ln.errors.ValidationError as error: print(error) ``` As above, we leverage a lookup object with valid cell types to find the correct name. ```python valid_cell_types = curator.slots["obs"].cat.lookup()["cell_type_by_expert"] adata.obs["cell_type_by_expert"] = adata.obs[ "cell_type_by_expert" ].cat.rename_categories( {"CD8-pos alpha-beta T cell": valid_cell_types.cd8_positive_alpha_beta_t_cell.name} ) ``` The validated `AnnData` can be subsequently saved as an {class}`~lamindb.Artifact`: ```python adata.obs.columns ``` ```python curator.slots["var.T"].cat.add_new_from("columns") ``` ```python curator.validate() ``` ```python artifact = curator.save_artifact(key="examples/my_curated_anndata.h5ad") ``` Access the schema for each slot: ```python artifact.features.slots ``` The saved artifact has been annotated with validated features and labels: ```python artifact.describe() ``` ## Unstructured dictionaries Most data structures support unstructured metadata stored as dictionaries: - Pandas DataFrames: `.attrs` - AnnData: `.uns` - MuData: `.uns` and `modality:uns` - SpatialData: `.attrs` Here, as an example, we show how to curate such metadata for AnnData: ```{eval-rst} .. literalinclude:: scripts/define_schema_anndata_uns.py :language: python :caption: define_schema_anndata_uns.py ``` ```python !python scripts/define_schema_anndata_uns.py ``` ```{eval-rst} .. literalinclude:: scripts/curate_anndata_uns.py :language: python :caption: curate_anndata_uns.py ``` ```python !python scripts/curate_anndata_uns.py ``` ## MuData ```{eval-rst} .. literalinclude:: scripts/curate_mudata.py :language: python :caption: curate_mudata.py ``` ```python !python scripts/curate_mudata.py ``` ## SpatialData ```{eval-rst} .. literalinclude:: scripts/define_schema_spatialdata.py :language: python :caption: define_schema_spatialdata.py ``` ```python !python scripts/define_schema_spatialdata.py ``` ```{eval-rst} .. literalinclude:: scripts/curate_spatialdata.py :language: python :caption: curate_spatialdata.py ``` ```python !python scripts/curate_spatialdata.py ``` ## TiledbsomaExperiment ```{eval-rst} .. literalinclude:: scripts/curate_soma_experiment.py :language: python :caption: curate_soma_experiment.py ``` ```python !python scripts/curate_soma_experiment.py ``` ## Other data structures If you have other data structures, read: {doc}`/faq/curate-any`.
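The linked guide builds on the low-level {class}`~lamindb.models.CanCurate` methods that registries like {class}`bionty.CellType` provide. As a minimal sketch (assuming the `bionty` module is mounted, as in this guide, and that the values you want to check sit in a plain Python list), you can validate any iterable directly against a registry before worrying about the container it lives in:

```python
import bionty as bt

values = ["T cell", "B-cell", "dendritic cell"]
bt.CellType.validate(values)     # boolean vector: which values have an exact match in the registry
bt.CellType.inspect(values)      # logs hints, e.g. that "B-cell" can be standardized to "B cell"
bt.CellType.standardize(values)  # returns the values with known synonyms mapped to standard names
```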
```python !rm -rf ./test-curate !rm -rf ./small_dataset.tiledbsoma !lamin delete --force test-curate ``` ================================================ FILE: docs/faq/acid.md ================================================ --- execute_via: python --- # Will data & metadata stay in sync? Here, we walk through different errors that can occur while saving artifacts & metadata records, and show that the LaminDB instance does not get corrupted by dangling metadata or artifacts. Transactions within Python across data & metadata are [ACID](https://en.wikipedia.org/wiki/ACID). If an upload process is externally killed and Python cannot run clean-up operations anymore, the artifact is internally still flagged with `artifact._storage_ongoing = True`. This is visible on the UI. You can then re-run `lamin save` or `artifact.save()` to attempt uploading the artifact a second time. ```python !lamin init --storage ./test-acid ``` ```python import pytest import lamindb as ln from upath import UPath ln.settings.verbosity = "debug" ``` ```python open("sample.fasta", "w").write(">seq1\nACGT\n") ``` ## Save error due to failed upload within Python Let's try to save an artifact to a storage location without permission. ```python artifact = ln.Artifact("sample.fasta", key="sample.fasta") ``` Because the public API only allows you to set a default storage for which you have permission, we need to hack it: ```python ln.settings.storage._root = UPath("s3://nf-core-awsmegatests") ``` This raises an exception but nothing gets saved: ```python with pytest.raises(PermissionError) as error: artifact.save() print(error.exconly()) assert len(ln.Artifact.filter()) == 0 ``` ## Save error during bulk creation ```python artifacts = [artifact, "this is not a record"] ``` This raises an exception but nothing gets saved: ```python with pytest.raises(Exception) as error: ln.save(artifacts) print(error.exconly()) assert len(ln.Artifact.filter()) == 0 # nothing got saved ``` If a list of data objects is passed to `ln.save()` and the upload of one of these data objects fails, the successful uploads are maintained and a `RuntimeError` is raised, listing the successfully uploaded data objects up until that point. ## Save error due to externally aborted upload Back to a proper storage location: ```python ln.settings.storage._root = UPath("./test-acid").absolute() ``` The save operation works: ```python artifact.save() ``` Let's pretend the upload was killed. ```python artifact._storage_ongoing = True artifact.save() artifact.path.unlink() assert artifact._aux == {"so": 1} # storage/upload is ongoing ``` We can re-run it: ```python artifact = ln.Artifact("sample.fasta", key="sample.fasta").save() ``` ```python assert not artifact._storage_ongoing assert artifact._aux is None ``` ```python !rm -r ./test-acid !lamin delete --force test-acid ``` ================================================ FILE: docs/faq/curate-any.md ================================================ --- execute_via: python --- # How do I validate & annotate arbitrary data structures? This guide walks through the low-level API that lets you validate iterables. You can then use the records created during validation to annotate a dataset. :::{dropdown} How do I validate based on a public ontology? LaminDB makes it easy to validate categorical variables based on registries that inherit from {class}`~lamindb.models.CanCurate`. {class}`~lamindb.models.CanCurate` methods validate against the registries in your LaminDB instance.
In {doc}`/manage-ontologies`, you'll see how to extend standard validation to validation against _public references_ using a `PublicOntology` object, e.g., via `public_genes = bt.Gene.public()`. By default, {meth}`~lamindb.models.CanCurate.from_values` considers a match in a public reference a validated value for any {mod}`bionty` entity. ::: ```python # pip install 'lamindb[zarr]' !lamin init --storage ./test-curate-any --modules bionty ``` Define a test dataset. ```python import lamindb as ln import bionty as bt import zarr import numpy as np data = zarr.open_group(store="data.zarr", mode="a") data.create_dataset(name="temperature", shape=(3,), dtype="float32") data.create_dataset(name="knockout_gene", shape=(3,), dtype=str) data.create_dataset(name="disease", shape=(3,), dtype=str) data["knockout_gene"][:] = np.array( ["ENSG00000139618", "ENSG00000141510", "ENSG00000133703"] ) data["disease"][:] = np.random.default_rng().choice( ["MONDO:0004975", "MONDO:0004980"], 3 ) ``` ## Validate and standardize vectors Read the `disease` array from the zarr group into memory. ```python disease = data["disease"][:] ``` {meth}`~lamindb.models.CanCurate.validate` validates vector-like values against reference values in a registry. It returns a boolean vector indicating where a value has an exact match in the reference values. ```python bt.Disease.validate(disease, field=bt.Disease.ontology_id) ``` When validation fails, you can call {meth}`~lamindb.models.CanCurate.inspect` to figure out what to do. {meth}`~lamindb.models.CanCurate.inspect` applies the same definition of validation as {meth}`~lamindb.models.CanCurate.validate`, but returns a rich {class}`~lamindb.models.InspectResult` object. Most importantly, it logs recommended curation steps that would render the data validated. Note: you can use {meth}`~lamindb.models.CanCurate.standardize` to standardize synonyms. ```python bt.Disease.inspect(disease, field=bt.Disease.ontology_id) ``` Bulk creating records using {meth}`~lamindb.models.CanCurate.from_values` only returns validated records. ```python diseases = bt.Disease.from_values(disease, field=bt.Disease.ontology_id).save() ``` Repeat the process for more labels: ```python experiments = ln.Record.from_values( ["Experiment A", "Experiment B"], field=ln.Record.name, create=True, # create non-validated labels ).save() genes = bt.Gene.from_values( data["knockout_gene"][:], field=bt.Gene.ensembl_gene_id ).save() ``` ## Annotate the dataset Register the dataset as an artifact: ```python artifact = ln.Artifact("data.zarr", key="my_dataset.zarr").save() ``` Annotate with features: ```python ln.Feature(name="experiment", dtype=ln.Record).save() ln.Feature(name="disease", dtype=bt.Disease.ontology_id).save() ln.Feature(name="knockout_gene", dtype=bt.Gene.ensembl_gene_id).save() artifact.features.set_values( {"experiment": experiments, "knockout_gene": genes, "disease": diseases} ) artifact.describe() ``` ```python # clean up test instance !rm -r data.zarr !rm -r ./test-curate-any !lamin delete --force test-curate-any ``` ================================================ FILE: docs/faq/idempotency.md ================================================ --- execute_via: python --- # Will data get duplicated upon re-running code? LaminDB's operations are idempotent in the sense defined here, which allows you to re-run code without duplicating data.
:::{admonition} SQLRecords with `name` field When you instantiate {class}`~lamindb.models.SQLRecord` with a name, in case a name has an _exact match_ in a registry, the constructor returns it instead of creating a new record. In case records with _similar names_ exist, you'll see them in a table: you can then decide whether you want to save the new record or pick an existing record. If you set {attr}`~lamindb.core.subsettings.CreationSettings.search_names` to `False`, you bypass these checks. ::: :::{admonition} Artifacts & collections If you instantiate {class}`~lamindb.Artifact` from data that already exists as an artifact, the `Artifact()` constructor returns the existing artifact based on a hash lookup. ::: ```python # pip install lamindb !lamin init --storage ./test-idempotency ``` ```python import lamindb as ln ln.track("ANW20Fr4eZgM0000") ``` ## SQLRecords with name field ```python assert ln.settings.creation.search_names ``` Let us add a first record to the {class}`~lamindb.Record` registry: ```python label = ln.Record(name="My label 1").save() ``` If we create a new record, we'll automatically get search results that give clues on whether we are prone to duplicating an entry: ```python label = ln.Record(name="My label 1a") ``` Let's save the `1a` label, we actually intend to create it. ```python label.save() ``` In case we match an existing name directly, we'll get the existing object: ```python label = ln.Record(name="My label 1") ``` If we save it again, it will not create a new entry in the registry: ```python label.save() ``` Now, if we create a third record, we'll get two alternatives: ```python label = ln.Record(name="My label 1b") ``` If we prefer to not perform a search, e.g. for performance reasons, we can switch it off. ```python ln.settings.creation.search_names = False label = ln.Record(name="My label 1c") ``` Switch it back on: ```python ln.settings.creation.search_names = True ``` ## Artifacts & collections ```python filepath = ln.examples.datasets.file_fcs() ``` Create an `Artifact`: ```python artifact = ln.Artifact(filepath, key="my_fcs_file.fcs").save() ``` ```python assert artifact.hash == "rCPvmZB19xs4zHZ7p_-Wrg" assert artifact.run == ln.context.run assert not artifact.recreating_runs.exists() ``` Create an `Artifact` from the same path: ```python artifact2 = ln.Artifact(filepath, key="my_fcs_file.fcs") ``` It gives us the existing object: ```python assert artifact.id == artifact2.id assert artifact.run == artifact2.run assert not artifact.recreating_runs.exists() ``` If you save it again, nothing will happen (the operation is idempotent): ```python artifact2.save() ``` In the hidden cell below, you'll see how this interplays with data lineage. ```python ln.track(new_run=True) artifact3 = ln.Artifact(filepath, key="my_fcs_file.fcs") assert artifact3.id == artifact2.id assert artifact3.run == artifact2.run != ln.context.run # run is not updated assert artifact2.recreating_runs.first() == ln.context.run ``` ```python !rm -rf ./test-idempotency !lamin delete --force test-idempotency ``` ================================================ FILE: docs/faq/import-modules.md ================================================ --- execute_via: python --- # What happens if I import a schema module without lamindb? 
```python # !pip install 'lamindb[bionty]' !lamin init --storage testmodule --modules bionty ``` Upon `import`, nothing yet happens: ```python import bionty as bt ``` If you try to access an attribute (other than `model`), you'll load the instance in the same way as calling `import lamindb`. Under the hood, `lamindb` is imported! ```python assert bt.Organism(name="human") is not None ``` ```python !lamin delete --force testmodule ``` ================================================ FILE: docs/faq/keep-artifacts-local.md ================================================ --- execute_via: python --- # Keep artifacts local in a cloud instance If you want to default to keeping artifacts local in a cloud instance, enable {attr}`~lamindb.setup.core.InstanceSettings.keep_artifacts_local`. Let us first create a cloud instance that woul store artifacts exclusively on S3. ```python !lamin login testuser1 !lamin init --storage s3://lamindb-ci/keep-artifacts-local ``` Let's import lamindb and track the current notebook run. ```python # pip install lamindb import lamindb as ln ln.track("l9lFf83aPwRc") ``` ## Toggling setting "keep artifacts local" You can checkmark the "Keep artifacts local" box on the instance settings tab. Or toggle it through the following instance setting. ```python ln.setup.settings.instance.keep_artifacts_local = True ``` ## Create a local storage location Call the following for a -- potentially pre-existing -- root path and a unique host identifier. ```python ln.Storage(root="./our_local_storage", host="abc-institute-drive1").save() ``` Now, you have two storage locations: one in the S3 bucket, and the other locally. ```python ln.Storage.to_dataframe() ``` You can now set it as a local default storage location. Next time you connect to the instance, this won't be necessary and the location will be automatically detected as the local default. ```python ln.settings.local_storage = "./our_local_storage" ``` ## Use a local storage location If you save an artifact in keep-artifacts-local mode, by default, it's stored in local storage. ```python original_filepath = ln.examples.datasets.file_fcs() artifact = ln.Artifact(original_filepath, key="example_datasets/file1.fcs").save() local_path = artifact.path # local storage path local_path ``` You'll see the `.fcs` file named by the `uid` in your `.lamindb/` directory under `./our_local_storage/`: ```python assert artifact.path.exists() assert artifact.path.as_posix().startswith(ln.settings.local_storage.root.as_posix()) ln.settings.local_storage.root.view_tree() ``` ## Pre-existing artifacts Assume you already have a file in your local storage location: ```python file_in_local_storage = ln.examples.datasets.file_bam() file_in_local_storage.rename("./our_local_storage/output.bam") ln.UPath("our_local_storage/").view_tree() ``` When registering an artifact for it, it remains where it is. ```python my_existing_file = ln.Artifact("./our_local_storage/output.bam").save() ln.UPath("our_local_storage/").view_tree() ``` The storage path of the artifact matches the pre-existing file: ```python my_existing_file.path ``` ## Switching between local storage locations You might have several local storage locations. Here is how you can switch between them. ```python ln.Storage(root="./our_local_storage2", host="abc-institute-drive1").save() ln.settings.local_storage = "./our_local_storage2" # switch to the new storage location ``` Ingest a file into the new local storage location. 
```python filepath = ln.examples.datasets.file_fastq() artifact3 = ln.Artifact(filepath, key="example_datasets/file.fastq.gz").save() ``` Inspect where all the files are. ```python ln.Artifact.to_dataframe(include=["storage__root", "storage__region"]) ``` ## Upload a local artifact to the cloud If you'd like to upload an artifact to the cloud storage location to more easily share it or view it through web applications, you pass `upload=True` to the `save()` method. ```python artifact.save(upload=True) ``` You now see the artifact in the S3 bucket: ```python ln.settings.storage.root.view_tree() ``` And it's no longer present in local storage: ```python assert artifact.path.exists() assert not local_path.exists() assert artifact.path.as_posix().startswith(ln.settings.storage.root.as_posix()) ln.settings.local_storage.root.view_tree() ``` ## Upload directly to the cloud You can also directly upload via `upload=True`: ```python filepath = ln.examples.datasets.file_mini_csv() artifact2 = ln.Artifact(filepath, key="example_datasets/mini.csv").save(upload=True) artifact2.path ``` Now we have two files on S3: ```python ln.Artifact.to_dataframe(include="storage__root") ``` ## Update storage description You can add a description to the storage location by using the `description` field. ```python storage_record = ln.Storage.get(root__endswith="our_local_storage") storage_record.description = "Our shared directory for project X" storage_record.save() ln.Storage.to_dataframe() ``` ## Delete the test instance Delete the artifacts: ```python artifact.delete(permanent=True) artifact2.delete(permanent=True) artifact3.delete(permanent=True) my_existing_file.delete(permanent=True, storage=False) ``` Delete the instance: ```python ln.setup.delete("keep-artifacts-local", force=True) ``` ================================================ FILE: docs/faq/pydantic-pandera.md ================================================ --- execute_via: python --- # Pydantic & Pandera vs. LaminDB This doc explains conceptual differences between data validation with `pydantic`, `pandera`, and `LaminDB`. ```python !lamin init --storage test-pydantic-pandera --modules bionty ``` Let us work with a test dataframe. ```python import pandas as pd import pydantic import lamindb as ln import bionty as bt import pandera.pandas as pandera import pprint from typing import Literal, Any df = ln.examples.datasets.mini_immuno.get_dataset1() df ``` ## Define a schema ### pydantic ```python Perturbation = Literal["DMSO", "IFNG"] CellType = Literal["T cell", "B cell"] OntologyID = Literal["EFO:0008913"] class ImmunoSchema(pydantic.BaseModel): perturbation: Perturbation cell_type_by_model: CellType cell_type_by_expert: CellType assay_oid: OntologyID concentration: str treatment_time_h: int donor: str | None class Config: title = "My immuno schema" ``` ### pandera ```python pandera_schema = pandera.DataFrameSchema( { "perturbation": pandera.Column( str, checks=pandera.Check.isin(["DMSO", "IFNG"]) ), "cell_type_by_model": pandera.Column( str, checks=pandera.Check.isin(["T cell", "B cell"]) ), "cell_type_by_expert": pandera.Column( str, checks=pandera.Check.isin(["T cell", "B cell"]) ), "assay_oid": pandera.Column(str, checks=pandera.Check.isin(["EFO:0008913"])), "concentration": pandera.Column(str), "treatment_time_h": pandera.Column(int), "donor": pandera.Column(str, nullable=True), }, name="My immuno schema", ) ``` ### LaminDB Features & labels are defined on the level of the database instance. 
You can either define a schema with required (and optional) columns. ```python ln.Record(name="DMSO").save() ln.Record(name="IFNG").save() # leverage ontologies through types ln.Record, bt.CellType, bt.ExperimentalFactor lamindb_schema = ln.Schema( name="My immuno schema", features=[ ln.Feature(name="perturbation", dtype=ln.Record).save(), ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(), ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(), ln.Feature(name="assay_oid", dtype=bt.ExperimentalFactor.ontology_id).save(), ln.Feature(name="concentration", dtype=str).save(), ln.Feature(name="treatment_time_h", dtype=int).save(), ln.Feature(name="donor", dtype=str, nullable=True).save(), ], ).save() ``` Or merely define a constraint on the feature identifier. ```python lamindb_schema_only_itype = ln.Schema( name="Allow any valid features & labels", itype=ln.Feature ) ``` ## Validate a dataframe ### pydantic ```python class DataFrameValidationError(Exception): pass def validate_dataframe(df: pd.DataFrame, model: type[pydantic.BaseModel]): errors = [] for i, row in enumerate(df.to_dict(orient="records")): try: model(**row) except pydantic.ValidationError as e: errors.append(f"row {i} failed validation: {e}") if errors: error_message = "\n".join(errors) raise DataFrameValidationError( f"DataFrame validation failed with the following errors:\n{error_message}" ) ``` ```python try: validate_dataframe(df, ImmunoSchema) except DataFrameValidationError as e: print(e) ``` To fix the validation error, we need to update the `Literal` and re-run the model definition. ```python Perturbation = Literal["DMSO", "IFNG"] CellType = Literal[ "T cell", "B cell", "CD8-positive, alpha-beta T cell" # <-- updated ] OntologyID = Literal["EFO:0008913"] class ImmunoSchema(pydantic.BaseModel): perturbation: Perturbation cell_type_by_model: CellType cell_type_by_expert: CellType assay_oid: OntologyID concentration: str treatment_time_h: int donor: str | None class Config: title = "My immuno schema" ``` ```python validate_dataframe(df, ImmunoSchema) ``` ### pandera ```python try: pandera_schema.validate(df) except pandera.errors.SchemaError as e: print(e) ``` ### LaminDB Because the term `"CD8-positive, alpha-beta T cell"` is part of the public `CellType` ontology, validation passes the first time. If validation had not passed, we could have resolved the issue simply by adding a new term to the `CellType` registry rather than editing the code. This also puts downstream data scientists into a position to update ontologies. ```python curator = ln.curators.DataFrameCurator(df, lamindb_schema) curator.validate() ``` What was the cell type validation based on? Let's inspect the `CellType` registry. ```python bt.CellType.to_dataframe() ``` The `CellType` registry is hierarchical as it contains the Cell Ontology. ```python bt.CellType.get(name="CD8-positive, alpha-beta T cell").view_parents() ``` ## Overview of validation properties Importantly, LaminDB offers not only a `DataFrameCurator`, but also an `AnnDataCurator`, `MuDataCurator`, `SpatialDataCurator`, and `TiledbsomaCurator`. The overview below only concerns validating dataframes.
### Experience of data engineer | property | `pydantic` | `pandera` | `lamindb` | | ------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------- | ----------------------------------------------------- | ------------------------------------------------------------------------------------- | | define schema as code | yes, in form of a `pydantic.BaseModel` | yes, in form of a `pandera.DataFrameSchema` | yes, in form of a `lamindb.Schema` | | define schema as a set of constraints without the need of listing fields/columns/features; e.g. useful if validating 60k genes | no | no | yes | | update labels independent of code | not possible because labels are enums/literals | not possible because labels are hard-coded in `Check` | possible by adding new terms to a registry | | built-in validation from public ontologies | no | no | yes | | sync labels with ELN/LIMS registries without code change | no | no | yes | | can re-use fields/columns/features across schemas | limited via subclass | only in same Python session | yes because persisted in database | | schema modifications can invalidate previously validated datasets | yes | yes | no because LaminDB allows to query datasets that were validated with a schema version | | can use columnar organization of dataframe | no, need to iterate over potentially millions of rows | yes | yes | ### Experience of data consumer | property | `pydantic` | `pandera` | `lamindb` | | ------------------------------------------- | ----------------------------------------------------------------------------- | --------------------- | -------------------------------------- | | dataset is queryable / findable | no | no | yes, by querying for labels & features | | dataset is annotated | no | no | yes | | user knows what validation constraints were | no, because might not have access to code and doesn't know which code was run | no (same as pydantic) | yes, via `artifact.schema` | ## Annotation & queryability ### Engineer: annotate the dataset Either use the `Curator` object: ```python artifact = curator.save_artifact(key="our_datasets/dataset1.parquet") ``` If you don't expect a need for Curator functionality for updating ontologies and standardization, you can also use the `Artifact` constructor. ```python artifact = ln.Artifact.from_dataframe( df, key="our_datasets/dataset1.parquet", schema=lamindb_schema ).save() ``` ### Consumer: see annotations ```python artifact.describe() ``` ### Consumer: query the dataset ```python ln.Artifact.filter(perturbation="IFNG").to_dataframe() ``` ### Consumer: understand validation By accessing `artifact.schema`, the consumer can understand _how_ the dataset was validated. ```python artifact.schema ``` ```python artifact.schema.features.to_dataframe() ``` ## Nested data with dynamic keys We will now examine another more complex example where data is nested with potentially arbitrary (dynamic) keys. The example is inspired by the [CELLxGENE schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/6.0.0/schema.md#uns-dataset-metadata) where annotations are stored as dictionaries in the AnnData `.uns` slot. ```python uns_dict = ln.examples.datasets.dict_cellxgene_uns() pprint.pprint(uns_dict) ``` ### pydantic Pydantic is primed to deal with nested data. 
```python class Images(pydantic.BaseModel): fullres: str hires: str class Scalefactors(pydantic.BaseModel): spot_diameter_fullres: float tissue_hires_scalef: float class Library(pydantic.BaseModel): images: Images scalefactors: Scalefactors class Spatial(pydantic.BaseModel): is_single: bool model_config = {"extra": "allow"} def __init__(self, **data): libraries = {} other_fields = {} # store all libraries under a single key for validation for key, value in data.items(): if key.startswith("library_"): libraries[key] = Library(**value) else: other_fields[key] = value other_fields["libraries"] = libraries super().__init__(**other_fields) class SpatialDataSchema(pydantic.BaseModel): organism_ontology_term_id: str spatial: Spatial validated_data = SpatialDataSchema(**uns_dict) ``` However, pydantic either requires all dictionary keys to be known beforehand to construct the model classes, or it requires workarounds that collect all dynamic keys under a single model. ### pandera Pandera cannot validate dictionaries because it is designed for structured dataframe data. Therefore, we need to flatten the dictionary to transform it into a DataFrame: ```python def _flatten_dict(d: dict[Any, Any], parent_key: str = "", sep: str = "_"): items = [] for k, v in d.items(): new_key = f"{parent_key}{sep}{k}" if parent_key else k if isinstance(v, dict): items.extend(_flatten_dict(v, new_key, sep=sep).items()) else: items.append((new_key, v)) return dict(items) ``` ```python def create_dynamic_schema(flattened_data: dict[str, Any]): schema_dict = { "organism_ontology_term_id": pandera.Column(str), "spatial_is_single": pandera.Column(bool), } for key in flattened_data.keys(): if key.startswith("spatial_library_") and key.endswith("_images_fullres"): lib_prefix = key.replace("_images_fullres", "") schema_dict.update( { f"{lib_prefix}_images_fullres": pandera.Column(str), f"{lib_prefix}_images_hires": pandera.Column(str), f"{lib_prefix}_scalefactors_spot_diameter_fullres": pandera.Column( float ), f"{lib_prefix}_scalefactors_tissue_hires_scalef": pandera.Column( float ), } ) return pandera.DataFrameSchema(schema_dict) flattened = _flatten_dict(uns_dict) df = pd.DataFrame([flattened]) spatial_schema = create_dynamic_schema(flattened) validated_df = spatial_schema.validate(df) ``` Analogously to pydantic, pandera does not have out-of-the-box support for dynamically named keys. Therefore, it is necessary to dynamically construct a pandera schema. ### LaminDB Similarly, LaminDB currently requires constructing flattened dataframes to dynamically create features for the schema, which can then be used for validation with the `DataFrameCurator`. Future improvements are expected, including support for a dictionary-specific curator.
```python def create_dynamic_schema(flattened_data: dict[str, Any]) -> ln.Schema: features = [] for key, value in flattened_data.items(): if key == "organism_ontology_term_id": features.append(ln.Feature(name=key, dtype=bt.Organism.ontology_id).save()) elif isinstance(value, bool): features.append(ln.Feature(name=key, dtype=bool).save()) elif isinstance(value, (int, float)): features.append(ln.Feature(name=key, dtype=float).save()) else: features.append(ln.Feature(name=key, dtype=str).save()) return ln.Schema(name="Spatial data schema", features=features, coerce=True).save() flattened = _flatten_dict(uns_dict) flattened_df = pd.DataFrame([flattened]) spatial_schema = create_dynamic_schema(flattened) curator = ln.curators.DataFrameCurator(flattened_df, spatial_schema) curator.validate() ``` ```{note} Curators for scverse data structures allow for the specification of schema slots that access and validate dataframes in nested dictionary attributes like `.attrs` or `.uns`. These schema slots use colon-separated paths like `'attrs:sample'` or `'uns:spatial:images'` to target specific dataframes for validation. ``` ================================================ FILE: docs/faq/reference-field.md ================================================ --- execute_via: python --- # Where to store external links and IDs? When registering data in LaminDB, you might want to store a reference link or ID to indicate the source of the collection. We have `reference` and `reference_type` fields for this purpose; they are available for {class}`~lamindb.Collection`, {class}`~lamindb.Transform`, {class}`~lamindb.Run` and {class}`~lamindb.Record`. ```python # !pip install lamindb !lamin init --storage testreference ``` ```python import lamindb as ln ``` Let's say we have a few donor samples that came from Vendor X. To be able to trace the orders back, we'd like to keep track of the donor IDs provided by the vendor: ```python ln.Record( name="donor 001", reference="VX984545", reference_type="Donor ID from Vendor X" ) ``` ```python !lamin delete --force testreference ``` ================================================ FILE: docs/faq/search.md ================================================ --- execute_via: python --- # How does search work? ```python from laminci.db import setup_local_test_postgres pgurl = setup_local_test_postgres() !lamin init --name benchmark_search --db {pgurl} --modules bionty --storage ./benchmark_search ``` Here we show how to perform text search on `SQLRecord` and evaluate some search queries for the {class}`bionty.CellType` ontology. ```python import lamindb as ln import bionty as bt SEARCH_QUERIES_EXACT = ( "t cell", "stem cell", "b cell", "regulatory B cell", "Be2 cell", "adipocyte", ) SEARCH_QUERIES_CONTAINS = ("t cel", "t-cel", "neural", "kidney", "kidne") TOP_N = 20 bt.CellType.import_source() ``` ```python ln.Record(name="cat[*_*]").save() ``` ## Search the registry ```python for query in SEARCH_QUERIES_EXACT: print("Query:", query) qs = bt.CellType.search(query) display(qs.to_dataframe()) assert query.lower() == qs[0].name.lower() ``` ```python for query in SEARCH_QUERIES_CONTAINS: print("Query:", query) qs = bt.CellType.search(query) display(qs.to_dataframe()) top_record = qs[0] query = query.lower() assert query in top_record.name.lower() or query in top_record.synonyms.lower() ``` Check escaping of special characters.
```python assert len(ln.Record.search("cat[")) == 1 ``` ```python assert len(ln.Record.search("*_*")) == 1 ``` ## Search the public ontology ```python ct_public = bt.CellType.public() df = ct_public.search("b cell", limit=20) assert df.iloc[0]["name"] == "B cell" df ``` ```python !docker stop pgtest && docker rm pgtest !lamin delete --force benchmark_search ``` ================================================ FILE: docs/faq/symbol-mapping.md ================================================ --- execute_via: python --- # Why should I not index datasets with gene symbols? Gene symbols are widely used for readability, particularly for visualization. However, indexing datasets with gene symbols presents challenges: - A single gene may have multiple symbols or aliases. - Gene symbols change over time (e.g., _BRCA2_ was once _FACD_) without version tracking. - The same symbol can represent different genes across species. - Symbols may be misinterpreted by software (e.g., _SEPT9_ as "September 9" in Excel). - Formatting inconsistencies exist (e.g., case sensitivity, special characters). Using unique identifiers like ENSEMBL gene IDs addresses these issues by providing: - A direct, stable mapping to genomic coordinates. - Consistency across databases. - Species-specific prefixes to prevent cross-species confusion. - Unique, permanent identifiers with standardized formatting. Storing ENSEMBL gene IDs alongside gene symbols offers readability for visualization while maintaining robust data integrity. During curation, validating against ENSEMBL gene IDs ensures accurate mapping. If only symbols are available for a dataset, you can map them to ENSEMBL IDs using {meth}`~bionty.Gene.standardize`. ```python # !pip install 'lamindb[bionty]' !lamin init --storage test-symbols --modules bionty ``` ```python import lamindb as ln import bionty as bt import numpy as np import pandas as pd import anndata as ad # create example AnnData object with gene symbols rng = np.random.default_rng(42) X = rng.integers(0, 100, size=(5, 10)) var = pd.DataFrame( index=pd.Index( [ "BRCA1", "TP53", "EGFR", "KRAS", "PTEN", "MYC", "VEGFA", "IL6", "TNF", "GAPDH", ], name="symbol", ) ) adata = ad.AnnData(X=X, var=var) adata.var ``` ```python # map Gene symbols to ENSEMBL IDs gene_mapper = bt.Gene.standardize( adata.var.index, field=bt.Gene.symbol, return_field=bt.Gene.ensembl_gene_id, return_mapper=True, organism="human", ) adata.var["ensembl_id"] = adata.var.index.map( lambda gene_id: gene_mapper.get(gene_id, gene_id) ) adata.var ``` ```python standardized_genes = bt.Gene.from_values( [ "ENSG00000141510", "ENSG00000133703", "ENSG00000111640", "ENSG00000171862", "ENSG00000204490", "ENSG00000112715", "ENSG00000146648", "ENSG00000136997", "ENSG00000012048", "ENSG00000136244", ], field=bt.Gene.ensembl_gene_id, organism="human", ) ln.save(standardized_genes) ``` This allows for validating the the `ensembl_id` against the `Gene` registry using the `bt.Gene.ensembl_gene_id` field. ```python bt.Gene.validate(adata.var["ensembl_id"], field=bt.Gene.ensembl_gene_id) ``` ```{note} Gene symbols do not map one-to-one with ENSEMBL IDs. A single gene symbol may correspond to multiple ENSEMBL IDs due to: 1. **Gene Paralogs**: Similar symbols can be shared among paralogous genes within the same species, resulting in one symbol linking to multiple ENSEMBL IDs. 2. **Pseudogenes**: Some symbols represent both functional genes and their non-functional pseudogenes, each with distinct ENSEMBL IDs. 3. 
**Transcript Variants**: One symbol may map to multiple ENSEMBL transcript IDs, each representing different isoforms or splice variants. {meth}`~bionty.Gene.standardize` retrieves the first match in cases of multiple hits, which is generally sufficient but not perfectly accurate. ``` ```python !lamin delete --force test-symbols ``` ================================================ FILE: docs/faq/test_notebooks.py ================================================ from pathlib import Path import nbproject_test as test import lamindb as ln def test_notebooks(): nbdir = Path(__file__).parent ln.setup.login("testuser1") ln.setup.init(storage=nbdir / "mydata") test.execute_notebooks(nbdir, write=True) ================================================ FILE: docs/faq/track-run-inputs.md ================================================ --- execute_via: python --- # Can I disable tracking run inputs? Yes, if you switch {attr}`~lamindb.core.Settings.track_run_inputs` to `False`. ```python # pip install lamindb !lamin init --storage test-run-inputs ``` ```python import lamindb as ln ``` Some test artifacts: ```python ln.track(transform=ln.Transform(key="Dummy pipeline")) ln.Artifact(ln.examples.datasets.file_jpg_paradisi05(), description="My image").save() ln.Artifact(ln.examples.datasets.file_mini_csv(), description="My csv").save() ``` Call `ln.track()`: ```python ln.track("Rx2s9aPTMQLY0000") ``` ## Don't track artifact as run input ```python ln.settings.track_run_inputs = False ``` ```python artifact = ln.Artifact.get(description="My image") ``` ```python artifact.cache() ``` No run inputs are linked to the current notebook run: ```python ln.Run.get(id=ln.context.run.id).input_artifacts.all() ``` ```python artifact.view_lineage() ``` ```python assert len(ln.Run.get(id=ln.context.run.id).input_artifacts.all()) == 0 ``` ## Manually track artifact as run input Let us manually track an artifact by passing `is_run_input` to either `.cache()`, `.load()` or `.open()`: ```python artifact.cache(is_run_input=True) ``` You can see the image artifact is now added to the run inputs: ```python for input in ln.Run.get(id=ln.context.run.id).input_artifacts.all(): print(input) ``` ```python artifact.view_lineage() ``` ```python assert len(ln.Run.get(id=ln.context.run.id).input_artifacts.all()) == 1 ``` ## Automatically track artifacts as run input If you switch on the following setting, any call to `.load()`, `.cache()`, or `.open()` will track the artifact as a run input. ```python ln.settings.track_run_inputs = True ``` ```python artifact = ln.Artifact.get(description="My csv") ``` ```python artifact.load() ``` ```python for input in ln.Run.get(id=ln.context.run.id).input_artifacts.all(): print(input) ``` ```python artifact.view_lineage() ``` ```python assert len(ln.Run.get(id=ln.context.run.id).input_artifacts.all()) == 2 ``` ```python !lamin delete --force test-run-inputs ``` ================================================ FILE: docs/faq/trash-archive.md ================================================ # How do I trash or archive objects? Any object in LaminDB has the following 3 levels of visibility through 3 default branches: - `main`: visible - `archive`: excluded from query & search - `trash`: excluded from query & search, scheduled for deletion Let's look at an example for an `Artifact` object while noting that the same applies to any other `SQLRecord`.
```python import lamindb as ln import pandas as pd df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) artifact = ln.Artifact.from_dataframe(df, key="dataset.parquet").save() ``` An artifact is by default created on the `main` branch. ```python assert artifact.branch.name == "main" ln.Artifact.filter(key="dataset.parquet").to_dataframe() # the artifact shows up ``` If you delete an artifact, it gets moved into the `trash` branch. ```python artifact.delete() assert artifact.branch.name == "trash" ``` Artifacts in trash won't show up in queries with default arguments: ```python ln.Artifact.filter(key="dataset.parquet").to_dataframe() # the artifact does not show up ``` You can query for them by adding the `trash` branch to the filter. ```python ln.Artifact.filter(key="dataset.parquet", branch__name="trash").to_dataframe() # the artifact shows up ``` You can restore an artifact from trash: ```python artifact.restore() ln.Artifact.filter(key="dataset.parquet").to_dataframe() # the artifact shows up ``` ================================================ FILE: docs/faq/validate-fields.md ================================================ --- execute_via: python --- # Django field validation [Django field validation](https://docs.djangoproject.com/en/5.1/ref/validators/) are enabled for models that inherit the `ValidateFields` class. ```python # pip install lamindb !lamin init --storage ./test-django-validation ``` ```python import lamindb as ln from lamindb.core.exceptions import FieldValidationError ``` ```python try: ln.Reference(name="my ref", doi="abc.ef", url="myurl.com") except FieldValidationError as e: print(e) ``` ```python !lamin delete --force test-django-validation ``` ================================================ FILE: docs/faq.md ================================================ # FAQ ```{toctree} :maxdepth: 1 faq/pydantic-pandera faq/idempotency faq/acid faq/track-run-inputs faq/curate-any faq/import-modules faq/reference-field faq/trash-archive faq/keep-artifacts-local faq/validate-fields faq/symbol-mapping faq/search ``` ================================================ FILE: docs/guide.md ================================================ # Guide ```{toctree} :hidden: :caption: "Overview" README ``` ```{toctree} :hidden: :caption: "How to" query-search track organize manage-changes manage-ontologies sync ``` ```{toctree} :hidden: :caption: Use cases lightning ``` ```{toctree} :hidden: :caption: Other topics faq storage ``` ================================================ FILE: docs/index.md ================================================ ```{include} ../README.md :start-line: 0 :end-line: 5 ``` ```{toctree} :maxdepth: 1 :hidden: guide api changelog ``` ================================================ FILE: docs/lightning.md ================================================ # Lightning This guide offers more context on the {class}`lamindb.integrations.lightning.Checkpoint` callback. For end-to-end examples, see the following guides: - {doc}`docs:clearml` - {doc}`docs:wandb` - {doc}`docs:mlflow` ## Quickstart Pass `ll.Checkpoint` and a logger into `Trainer`. The logger is what gives checkpoints meaningful, namespaced artifact keys — without it, keys fall back to a bare `checkpoints/` prefix (or just the run UID when `ln.track()` is active). Any logger implementing Lightning's `Logger` interface works (`TensorBoardLogger`, `WandbLogger`, `MLFlowLogger`, `CSVLogger`, etc.). We use `TensorBoardLogger` in the examples below. 
```python import lamindb as ln import lightning.pytorch as pl from lightning.pytorch.loggers import TensorBoardLogger from lamindb.integrations import lightning as ll ln.track() logger = TensorBoardLogger(save_dir="logs") checkpoint = ll.Checkpoint(monitor="val_loss", mode="min", save_top_k=3) trainer = pl.Trainer( max_epochs=10, callbacks=[checkpoint], logger=logger, ) trainer.fit(model, datamodule=datamodule) ``` After training, each saved checkpoint file is a LaminDB artifact: ```python checkpoint.last_checkpoint_artifact checkpoint.last_checkpoint_artifact.key # e.g. "logs/lightning_logs/2r5pIRnK7z0q/checkpoints/epoch=0-step=100.ckpt" checkpoint.checkpoint_key_prefix # e.g. "logs/lightning_logs/2r5pIRnK7z0q/checkpoints" ``` ### How is a run organized? A Lightning `Trainer` coordinates three concerns during training: 1. **Logger** — writes metrics (loss curves, learning rate, etc.) to a dashboard directory. The logger determines the local directory layout: `{save_dir}/{name}/{version}/`. 2. **ModelCheckpoint** — saves model snapshots (`.ckpt` files) into a `checkpoints/` subdirectory underneath the logger's directory. 3. **SaveConfigCallback** — when using `LightningCLI`, writes the fully resolved `config.yaml` into the logger's directory so you can reproduce exactly which hyperparameters were used. All three share the same directory tree. The logger creates it, the checkpoint callback writes into it, and the config callback stores beside it: ``` logs/ # logger save_dir lightning_logs/ # logger name version_0/ # logger version (local filesystem) events.out.tfevents.* # ← logger output (TensorBoard) config.yaml # ← SaveConfigCallback checkpoints/ epoch=0-step=100.ckpt # ← ModelCheckpoint epoch=1-step=200.ckpt hparams.yaml # ← auto-generated by Lightning ``` LaminDB's integration replaces `ModelCheckpoint` with `ll.Checkpoint` and Lightning's `SaveConfigCallback` with `ll.SaveConfigCallback`. Checkpoint files, the config, and `hparams.yaml` become `lamindb.Artifact` records with lineage tracking and optional feature annotations. Note that artifact keys in LaminDB do **not** mirror the local directory layout exactly — the callback uses the LaminDB run UID instead of Lightning's auto-incrementing `version_N` directory by default. See [How artifact keys are derived](#how-artifact-keys-are-derived) for details. ### Which kind of artifacts? `Checkpoint` saves three kinds of artifacts: | Kind | Example key | When | | ------------ | ------------------------------------- | ---------------------------------------- | | `checkpoint` | `…/checkpoints/epoch=0-step=100.ckpt` | Every time Lightning writes a checkpoint | | `config` | `…/config.yaml` | When using `ll.SaveConfigCallback` | | `hparams` | `…/checkpoints/hparams.yaml` | When Lightning generates it | Checkpoints and `hparams.yaml` live under the `checkpoints/` subdirectory, while the config sits directly under the base prefix. The callback tracks the latest artifact of each kind: ```python checkpoint.last_checkpoint_artifact checkpoint.last_config_artifact checkpoint.last_hparams_artifact checkpoint.last_artifact_event ``` ### How is data lineage tracked? When a run is being tracked with `ln.track()`: - `checkpoint` artifacts are recorded as **run outputs** — they are produced by the training run. - `config` artifacts are recorded as **run inputs** — the resolved config is part of the run specification. - `hparams.yaml` is saved as an artifact but not linked as a run input. ## How are artifact keys derived? 
LaminDB artifact keys are **not** necessarily a mirror of the local filesystem layout. Lightning uses auto-incrementing version directories (`version_0`, `version_1`, …) on disk, but these are meaningless as artifact identifiers — they depend on what already exists locally and cannot reliably distinguish runs across machines. Instead, when `ln.track()` is active, the callback uses the **LaminDB run UID** as the version segment by default (`run_uid_is_version=True`). This guarantees that every tracked run produces unique artifact keys regardless of local state. The base prefix is determined by priority: | Scenario | Base prefix | | ------------------------ | -------------------------------------- | | `dirpath` set (± logger) | `{dirpath}/{run_uid}` | | No `dirpath` + logger | `{save_dir_basename}/{name}/{run_uid}` | | No `dirpath` + no logger | `{run_uid}` | `run_uid` above refers to the active LaminDB run UID (from `ln.context.run.uid`). When no run is tracked or `run_uid_is_version=False`, the callback falls back to the logger's own version (e.g. `version_0`) or omits the segment entirely. **Checkpoint & hparams keys:** | Scenario | LaminDB key pattern | | ----------------------------- | ------------------------------------------------------------- | | Logger present (recommended) | `{save_dir_basename}/{name}/{run_uid}/checkpoints/{filename}` | | No logger, explicit `dirpath` | `{dirpath}/{run_uid}/checkpoints/{filename}` | | No logger, no `dirpath` | `{run_uid}/checkpoints/{filename}` | **Config keys:** | Scenario | Key pattern | | ----------------------------- | -------------------------------------------------- | | Logger present | `{save_dir_basename}/{name}/{run_uid}/config.yaml` | | No logger, explicit `dirpath` | `{dirpath}/{run_uid}/config.yaml` | | No logger, no `dirpath` | `{run_uid}/config.yaml` | For example, with `TensorBoardLogger(save_dir="logs")` and a tracked run: ``` logs/lightning_logs/2r5pIRnK7z0q/ # base prefix ({save_dir_basename}/{name}/{run_uid}) config.yaml # ← config artifact checkpoints/ epoch=0-step=100.ckpt # ← checkpoint artifact hparams.yaml # ← hparams artifact ``` ### Opting out of run UID keys Pass `run_uid_is_version=False` to fall back to the logger-managed version directory, matching Lightning's local layout more closely: ```python checkpoint = ll.Checkpoint( monitor="val_loss", run_uid_is_version=False, ) ``` With this setting, the key uses the logger's version (`version_0`, etc.) instead of the run UID. This is mainly useful when you don't call `ln.track()` or when you want artifact keys that exactly mirror the local directory tree. ### Why run UIDs instead of `version_N`? Lightning's auto-incrementing `version_N` depends on what directories already exist at `save_dir`. Two runs on different machines — or the same machine after clearing `logs/` — can both produce `version_0`. With `run_uid_is_version=True` (the default), each tracked run gets a unique prefix derived from the Lamin run, so artifact keys never collide. ## Use with the Lightning CLI The Lightning CLI resolves a YAML config into concrete model and data module arguments. To also store that resolved config as a LaminDB artifact, pass `ll.SaveConfigCallback` in your training script and declare the trainer, logger, callbacks, model, and data in a config file. 
**`config.yaml`** ```yaml trainer: max_epochs: 10 logger: class_path: lightning.pytorch.loggers.TensorBoardLogger init_args: save_dir: logs callbacks: - class_path: lamindb.integrations.lightning.Checkpoint init_args: monitor: val/loss mode: min save_top_k: 3 model: learning_rate: 1.0e-3 data: batch_size: 64 ``` **`train.py`** ```python import lamindb as ln from lightning.pytorch.cli import LightningCLI from lamindb.integrations.lightning import SaveConfigCallback ln.track() def cli_main() -> None: LightningCLI( model_class=MyModel, datamodule_class=MyDataModule, save_config_callback=SaveConfigCallback, ) if __name__ == "__main__": cli_main() ``` ```bash python train.py fit --config config.yaml ``` `ll.SaveConfigCallback` extends Lightning's built-in version: it writes the local file as usual and then delegates to whichever `ArtifactPublishingModelCheckpoint` is registered on the trainer to persist the config as an artifact. ## Annotating with features Attach custom run-level and artifact-level feature values through `features=`: ```python logger = TensorBoardLogger(save_dir="logs") checkpoint = ll.Checkpoint( monitor="val_loss", features={ "run": {"training_framework": "lightning"}, "artifact": {"dataset_version": "2026-03"}, }, ) trainer = pl.Trainer(callbacks=[checkpoint], logger=logger) ``` Feature names must already exist in Lamin. The callback can also auto-track standard Lightning fields. Create the corresponding LaminDB features once: ```python ll.save_lightning_features() ``` This enables auto-features: - Artifact-level: `is_best_model`, `is_last_model`, `score`, `model_rank`, `save_weights_only`, `monitor`, `mode` - Run-level: `logger_name`, `logger_version`, `max_epochs`, `max_steps`, `precision`, `accumulate_grad_batches`, `gradient_clip_val`, `monitor`, `mode` ## Extending the callback ### Subclass `Checkpoint` Subclass when you want to keep LaminDB persistence and additionally notify an external system after each artifact is saved: ```python from lamindb.integrations import lightning as ll from my_model_registry import ModelRegistry class ModelRegistryCheckpoint(ll.Checkpoint): """Register each checkpoint in an external model registry.""" def __init__(self, *args, registry_project: str, **kwargs): super().__init__(*args, **kwargs) self.registry_project = registry_project self.model_registry = ModelRegistry() def on_artifact_saved(self, event: ll.ArtifactSavedEvent) -> None: if event.kind == "checkpoint": # register the model in your external system self.model_registry.register( project=self.registry_project, model_uri=event.storage_uri, metadata={"lamin_key": event.key}, ) logger = TensorBoardLogger(save_dir="logs") checkpoint = ModelRegistryCheckpoint( registry_project="my-project", monitor="val_loss", save_top_k=3, ) trainer = pl.Trainer(callbacks=[checkpoint], logger=logger) trainer.fit(model, datamodule=datamodule) ``` Each event gives you: - `event.kind`: `"checkpoint"`, `"config"`, or `"hparams"` - `event.artifact`: the persisted LaminDB artifact - `event.key`: the LaminDB artifact key - `event.local_path`: the local file path Lightning wrote - `event.storage_uri`: the stable storage URI for downstream systems ### Attach an observer Observers are useful when you want composition instead of inheritance: ```python from lamindb.integrations import lightning as ll class ArtifactLogger: def on_artifact_saved(self, event: ll.ArtifactSavedEvent) -> None: print(event.kind, event.storage_uri) def on_artifact_removed(self, event: ll.ArtifactRemovedEvent) -> None: print("removed", 
event.key) logger = TensorBoardLogger(save_dir="logs") checkpoint = ll.Checkpoint( monitor="val_loss", artifact_observers=[ArtifactLogger()], ) trainer = pl.Trainer(callbacks=[checkpoint], logger=logger) trainer.fit(model, datamodule=datamodule) ``` Observers receive the same events that subclasses see. ## Integrating other systems To register checkpoints in another system (e.g. ClearML, Weights & Biases, MLflow, Neptune, or Comet), use the artifact lifecycle events rather than re-deriving paths from Lightning internals. The key hand-off value is `event.storage_uri`, which resolves to the persisted artifact location. `event.artifact` gives you the full LaminDB record when you need metadata beyond the URI. ================================================ FILE: docs/manage-changes.md ================================================ # Manage changes Managing changes in LaminDB is largely analogous to managing code changes via branching in git and Pull Requests in GitHub. For usage examples, read the `Examples` section of the {class}`~lamindb.Branch` class. ================================================ FILE: docs/manage-ontologies.md ================================================ --- execute_via: python --- # Manage biological ontologies This guide shows how to manage ontologies for basic biological entities. ```{raw} html ``` If instead you're interested in - accessing public ontologies, see {doc}`docs:public-ontologies` - flexible bio registries for the wetlab (a LIMS), see {class}`~lamindb.Record` and {doc}`docs:records` ```python # pip install lamindb !lamin init --storage ./test-ontologies --modules bionty ``` ## Import records from public ontologies Let's first populate our {class}`~bionty.CellType` registry with the default public ontology (Cell Ontology). ```python import lamindb as ln import bionty as bt # inspect the available public ontology versions bt.Source.to_dataframe() ``` ```python # inspect which ontology version we're about to import bt.Source.get(entity="bionty.CellType", currently_used=True) ``` ```python # populate the database with a public ontology bt.CellType.import_source() ``` This is now your in-house cell type ontology in which you can add & modify records as you like. It's a registry just like `Artifact` or `Record`. ```python # all public cell types are now available in LaminDB bt.CellType.to_dataframe() ``` ```python # let's also populate the Gene registry with human and mouse genes bt.Gene.import_source(organism="human") bt.Gene.import_source(organism="mouse") ``` ## Access records in in-house registries Search key words: ```python bt.CellType.search("gamma-delta T").to_dataframe().head(2) ``` Or look up with auto-complete: ```python cell_types = bt.CellType.lookup() hsc_record = cell_types.hematopoietic_stem_cell hsc_record ``` Filter by fields and relationships: ```python gdt_cell = bt.CellType.get(ontology_id="CL:0000798", created_by__handle="testuser1") gdt_cell ``` View the ontological hierarchy: ```python gdt_cell.view_parents() # pass with_children=True to also view children ``` Or access the parents and children directly: ```python gdt_cell.parents.to_dataframe() ``` ```python gdt_cell.children.to_dataframe() ``` It is also possible to recursively query parents or children, getting direct parents (children), their parents, and so forth. 
```python gdt_cell.query_parents().to_dataframe() ``` ```python gdt_cell.query_children().to_dataframe() ``` ## Construct custom hierarchies of records You can register a new record and add it as a child of an existing parent record: ```python # register a new cell type my_celltype = bt.CellType(name="my new T-cell subtype").save() # specify "gamma-delta T cell" as a parent my_celltype.parents.add(gdt_cell) # visualize hierarchy my_celltype.view_parents(distance=3) ``` ## Create new records When accessing datasets, one often encounters bulk references to entities that might be corrupted or standardized according to different standardization schemes. Let's consider an example based on an `AnnData` object. In its `cell_type` annotations, we find 4 references to cell types: ```python adata = ln.examples.datasets.anndata_with_obs() adata.obs.cell_type.value_counts() ``` We'd like to load the corresponding records in our in-house registry to annotate a dataset. To this end, you'll typically use {class}`~lamindb.models.CanCurate.from_values`, which will both validate & retrieve records that match the values. ```python cell_types = bt.CellType.from_values(adata.obs.cell_type) cell_types ``` Logging informed us that 3 cell types were validated. Since we loaded these records at the same time, we could readily use them to annotate a dataset. :::{dropdown} What happened under-the-hood? `.from_values()` performs the following lookups: 1. If registry records match the values, load these records 2. If values match synonyms of registry records, load these records 3. If no record in the registry matches, attempt to load records from a public ontology 4. Same as 3. but based on synonyms No records will be returned if all 4 lookups are unsuccessful. Sometimes, it's useful to treat validated records differently from non-validated records. Here is a way: ``` original_values = ["gut", "gut2"] inspector = bt.Tissue.inspect(original_values) records_from_validated_values = bt.Tissue.from_values(inspector.validated) ``` ::: Alternatively, we can retrieve records based on ontology ids: ```python adata.obs.cell_type_id.unique().tolist() ``` ```python bt.CellType.from_values(adata.obs.cell_type_id, field=bt.CellType.ontology_id) ``` ## Validate & standardize Simple validation of an iterable of values works like so: ```python bt.CellType.validate(["fat cell", "blood forming stem cell"]) ``` Because these values don't comply with the registry, they're not validated!
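If you need to handle the failing values programmatically, note that `validate()` returns a boolean mask aligned with the input values. Here is a minimal sketch (the variable names are just illustrative):

```python
values = ["fat cell", "blood forming stem cell"]
validated_mask = bt.CellType.validate(values)  # boolean array, one entry per input value
non_validated = [value for value, ok in zip(values, validated_mask) if not ok]
non_validated  # the values that still need to be standardized or registered
```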
You can easily convert these values to validated standardized names based on synonyms like so: ```python bt.CellType.standardize(["fat cell", "blood forming stem cell"]) ``` Alternatively, you can use `.from_values()`, which will only ever return validated records and automatically standardize under-the-hood: ```python bt.CellType.from_values(["fat cell", "blood forming stem cell"]) ``` If you are not sure what to do, use `.inspect()` to get instructions: ```python bt.CellType.inspect(["fat cell", "blood forming stem cell"]); ``` We can also add new synonyms to a record: ```python hsc_record.add_synonym("HSC") ``` And when we encounter this synonym as a value, it will now be standardized via synonym lookup and mapped onto the correct registry record: ```python bt.CellType.standardize(["HSC"]) ``` A special synonym is `.abbr` (short for abbreviation), which has its own field and can be assigned via: ```python hsc_record.set_abbr("HSC") ``` You can create a lookup object from the `.abbr` field: ```python cell_types = bt.CellType.lookup("abbr") cell_types.hsc ``` The same workflow works for all of `bionty`'s registries. ## Manage ontologies across organisms Several registries are organism-aware (they have an `.organism` field), for instance, {class}`~bionty.Gene`. In this case, API calls that interact with multi-organism registries require an `organism` argument when there's ambiguity. For instance, when validating gene symbols: ```python bt.Gene.validate(["TCF7", "ABC1"], organism="human") ``` In contrast, working with Ensembl Gene IDs doesn't require passing `organism`, as there's no ambiguity: ```python bt.Gene.validate( ["ENSG00000000419", "ENSMUSG00002076988"], field=bt.Gene.ensembl_gene_id ) ``` When working with the same organism throughout your analysis/workflow, you can omit the `organism` argument by configuring it globally: ```python bt.settings.organism = "mouse" bt.Gene.from_source(symbol="Ap5b1") ``` ## Track ontology versions Under-the-hood, source ontology versions are automatically tracked for each registry: ```python bt.Source.filter(currently_used=True).to_dataframe() ``` Each record is linked to a versioned public source (if it was created from public): ```python hepatocyte = bt.CellType.get(name="hepatocyte") hepatocyte.source ``` ## Create records from a specific ontology version By default, new records are imported or created from the `"currently_used"` public sources which are configured during instance initialization, e.g.: ```python bt.Source.filter(entity="bionty.Phenotype", currently_used=True).to_dataframe() ``` Sometimes, the default source doesn't contain the ontology term you are looking for. You can then create a record from a non-default source.
For instance, we can use the `ncbitaxon` ontology: ```python source = bt.Source.get(entity="bionty.Organism", name="ncbitaxon") source ``` ```python # validate against the NCBI Taxonomy bt.Organism.validate( ["iris setosa", "iris versicolor", "iris virginica"], source=source ) ``` ```python # since we didn't seed the Organism registry with the NCBITaxon public ontology # we need to save the records to the database records = bt.Organism.from_values( ["iris setosa", "iris versicolor", "iris virginica"], source=source ).save() # now we can query a iris organism and view its parents and children bt.Organism.get(name="iris").view_parents(with_children=True) ``` ## Access any Ensembl genes Genes from all Ensembl versions and organisms can be accessed, even though they are not yet present in the `bt.Source` registry. For instance, if you want to use `rabbit` genes from Ensembl version `release-103`: ```python # pip install pymysql import bionty as bt # automatically download genes for a new organism gene_ontology = bt.base.Gene(source="ensembl", organism="rabbit", version='release-103') # register the new source in lamindb gene_ontology.register_source_in_lamindb() # now you can start using this source # import all genes from this source to your Gene registry source = bt.Source.get(entity="bionty.Gene", name="ensembl", organism="rabbit", version="release-103") bt.Gene.import_source(source=source) ``` ================================================ FILE: docs/organize.md ================================================ # Organize datasets ```{toctree} :maxdepth: 1 :hidden: curate ``` This guide walks through organizing datasets using files & folders, database relationships, and versioned collections. ## Via files & folders You can use LaminDB like a file system. Similar to AWS S3, you organize artifacts into virtual folders using `/`-separated keys. To ingest a single file into a `project1/` folder, you'd call: ```python artifact1 = ln.Artifact("./dataset.csv", key="project1/dataset1.csv").save() ``` For convenience, if you want to create an artifact for every file in a directory, use {meth}`~lamindb.Artifact.from_dir`: ```python artifacts = ln.Artifact.from_dir("./project1/").save() ``` You can then query for all artifacts in the `"./project1/"` folder via: ```python artifacts = ln.Artifact.filter(key__startswith="project1/") ``` Unlike a regular file system, every artifact is versioned and comes with rich metadata. :::{dropdown} What if I do not care about the metadata and version of every file in a folder? In some cases a folder _is_ the dataset and you don't need fine-grained information for every file. In this scenario, save the entire directory as a single artifact: ```python ln.Artifact("./folder_abc", key="folder_abc").save() ``` ::: ## Via relationships in the database ### Annotating with projects What if an artifact is relevant to multiple projects? A dataset that's in the `project1/` folder cannot **also** reside in a `project2/` folder. 
You can solve this problem with the `artifact.projects` relationship that links the {class}`~lamindb.Project` to {class}`~lamindb.Artifact`. Here is how to annotate one artifact with two projects: ```python project1 = ln.Project(name="Project 1").save() # create project 1 project2 = ln.Project(name="Project 2").save() # create project 2 artifact1.projects.add(project1, project2) # annotate artifact1 ``` This allows you to retrieve `artifact1` by querying any project it belongs to: ```python artifacts_in_project1 = ln.Artifact.filter(projects=project1) artifacts_in_project2 = ln.Artifact.filter(projects=project2) ``` Here, `artifact1` is part of both query results. :::{dropdown} Three additional advantages to using related registries rather than folder structures. 1. Projects can be richly annotated (e.g., with start/end dates, parent projects, or member roles). 2. You no longer need to rely on fragile file paths. If a folder is renamed, path-based retrieval breaks, but a project query by `uid` will always work.[^protectproject] 3. You can run a constrained query or search against all projects in your database rather than trying to narrow a search to folder names. ::: ### Annotating with labels You can annotate with other entity types, not just projects. LaminDB offers two main classes for this: {class}`~lamindb.Record` for metadata records and {class}`~lamindb.ULabel` for simple labels, which both link to artifacts. Here is how to annotate with a ulabel and with a sample record: ```python ulabel1 = ln.ULabel(name="raw_data").save() # create a ulabel artifact1.ulabels.add(ulabel1) # annotate artifact1 sample_type = ln.Record( # create a record type "Samples" name="Samples", is_type=True ).save() record1 = ln.Record( # create a sample record of type "Samples" name="My sample", type=sample_type, features={"gc_content": 0.5} ).save() artifact1.records.add(record1) # annotate artifact1 ``` You can use records and ulabels alongside entity types in modules such as {mod}`bionty`: ```python import bionty as bt cell_type1 = bt.CellType.from_source( name="T cell" # create a cell type from a public ontology ).save() artifact1.cell_types.add(cell_type1) # annotate artifact1 ``` ### Annotating with features To annotate with non-categorical data types or to disambiguate categorical annotations, use {class}`~lamindb.Feature` objects. Here is how to define features and annotate an artifact with feature values: ```python exp_type = ln.Record.get(name="Experiments") # query the entity type `Experiments` ln.Feature(name="gc_content", dtype=float).save() # define a feature with dtype float ln.Feature(name="experiment", dtype=exp_type).save() # define a feature with dtype `Experiments` artifact.features.set_values({ "gc_content": 0.55, # validated to be a float "experiment": "Experiment 1", # validated to exist under the `Experiments` record type }) ``` When you work with structured data formats like `DataFrame` or `AnnData`, it often makes sense to validate the content of their features. After validation, the parsed feature values are automatically used for annotation. The easiest way to use validation and auto-annotation is the built-in schema `"valid_features"`: ```python # validate columns in the dataframe and map them onto features # auto-annotate with parsed metadata ln.Artifact.from_dataframe(df, schema="valid_features").save() ``` Below is an example from the {doc}`docs:tutorial` illustrating how you get e.g. cell type, treatment, and assay annotations based on a dataframe's content.
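As a minimal, self-contained sketch of this pattern (using the mini-immuno example dataset that ships with `lamindb`, analogous to `docs/scripts/curate_dataframe_flexible.py` in this repository):

```python
import lamindb as ln

# register the features & labels that the mini-immuno example dataset uses
ln.examples.datasets.mini_immuno.define_features_labels()
df = ln.examples.datasets.mini_immuno.get_dataset1(otype="DataFrame")

# validate the dataframe columns against registered features and
# auto-annotate the artifact with the parsed values
artifact = ln.Artifact.from_dataframe(
    df, key="examples/dataset1.parquet", schema="valid_features"
).save()
artifact.describe()  # displays the features & labels parsed from the dataframe
```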
You can read more on this in {doc}`/curate`. ### Annotating with data-lineage When you call {func}`~lamindb.track` or decorate a function with {func}`~lamindb.flow`, you automatically annotate artifacts with {class}`~lamindb.Run` and {class}`~lamindb.Transform` objects. Here is how: ```{eval-rst} .. literalinclude:: scripts/run_track_and_finish.py :language: python ``` Note that you can pass `project` to {func}`~lamindb.track` to auto-annotate all objects that are created in a run with a project label. Read more in {doc}`/track`. ### Overview of auto-generated annotations The {class}`~lamindb.Artifact` registry has simple fields (such as `description`, `created_at`, `size`) and related fields (such as `projects`, `created_by`, `storage`). Many of these fields are automatically populated and you can use them to retrieve sets of artifacts. All other registries link to {class}`~lamindb.Artifact` to provide context for finding, querying, validating, and managing artifacts.[^starsnowflake] :::{dropdown} Can you give me some example queries? Here are examples leveraging auto-populated fields. ```python artifacts = ln.Artifact.filter( created_at__gt="2023-06-24", # created after June 24th, 2023 size__lt=1e9, # smaller than 1GB suffix=".parquet", # with a .parquet suffix n_observations__gt=1000, # with more than 1000 observations n_files__gt=1000, # folder-like artifacts with more than 1000 files otype="DataFrame", # that are DataFrames created_on__name="my-branch", # created on a specific branch or environment created_by__handle="falexwolf", # created by user with handle falexwolf run=run, # created by a specific run transform__name="my-script.py", # created by a specific script/notebook ) ``` ::: ## Versioned collections of artifacts If you want to group artifacts by metadata and version the entire set, use {class}`~lamindb.Collection`. Unlike during annotation, you have to pass an entire group of artifacts to a `Collection` constructor: ```python collection = ln.Collection([artifact1, artifact2], key="my_data_release").save() ``` And unlike the folder-based or annotation-based sets of artifacts — which can change as artifacts are added or removed — a collection guarantees an exact, immutable set of artifacts. Artifacts are versioned based on the hash of their content. Collections are versioned based on the top-level hash of their artifact hashes. If you use the {meth}`~lamindb.Collection.append` method, a new version of the collection is created, and the old version is left unchanged: ```python collection_v2 = collection.append(artifact3) ``` While collections are indirectly annotated through the annotations of the artifacts they contain, you can also add collection-level annotations. Like artifacts, collections link to projects, runs, ulabels, records, and most other registries. [^starsnowflake]: You can consider the SQL table underlying {class}`~lamindb.Artifact` your _fact table_ and all other tables for other entities your _dimension tables_ in a star or snowflake schema ([see Wikipedia](https://en.wikipedia.org/wiki/Fact_table)). [^protectproject]: The project annotation of the artifact is protected against the deletion of the project. If a user with the necessary rights attempts to delete the project, they will get an error. ================================================ FILE: docs/pertdb.md ================================================ # `pertdb` ```{eval-rst} ..
automodule:: pertdb ``` ================================================ FILE: docs/query-search.md ================================================ # Query, search & stream ```{toctree} :maxdepth: 1 registries arrays ``` ================================================ FILE: docs/registries.md ================================================ --- execute_via: python --- # Query & search registries This guide walks through different ways of querying & searching LaminDB registries. To understand the underlying cross-linking of objects in the SQL database, read {doc}`organize`. If you already have a set of artifacts, e.g. in the form of parquet files, and you'd like to now query/stream their (validated) content, read {doc}`arrays`. ```python # initialize a test database to run examples !lamin init --storage ./test-registries --modules bionty ``` Let's start by creating a few exemplary datasets: ```python import lamindb as ln ln.Artifact(ln.examples.datasets.file_fastq(), key="raw/my_fastq.fastq.gz").save() ln.Artifact(ln.examples.datasets.file_jpg_paradisi05(), key="my_image.jpg").save() ln.Artifact.from_dataframe(ln.examples.datasets.df_iris(), key="iris.parquet").save() ln.examples.datasets.mini_immuno.save_mini_immuno_datasets() ``` ## Get an overview The easiest way to get an overview over all artifacts is by typing {meth}`~lamindb.Artifact.to_dataframe`, which returns the 100 latest artifacts in the {class}`~lamindb.Artifact` registry. ```python ln.Artifact.to_dataframe() ``` You can include features. ```python ln.Artifact.to_dataframe(include="features") ``` You can include fields from other registries. ```python ln.Artifact.to_dataframe( include=[ "created_by__name", "records__name", "cell_types__name", "schemas__itype", ] ) ``` You can also get an overview of the entire database. ```python ln.view() ``` ## Auto-complete objects For registries with less than 100k objects, auto-completing a `Lookup` object is the most convenient way of finding a record. ```python records = ln.Record.lookup() ``` With auto-complete, we find a record: ```python experiment_1 = records.experiment_1 experiment_1 ``` This works for any {class}`~lamindb.models.BaseSQLRecord` class, e.g., also for plugin `bionty`. ```python import bionty as bt cell_types = bt.CellType.lookup() ``` ## Get one object {meth}`~lamindb.models.BaseSQLRecord.get` errors if none or more than one matching objects are found. ```python ln.Record.get(experiment_1.uid) # by uid ln.Record.get(name="Experiment 1") # by field ``` ## Query objects by fields Use {meth}`~lamindb.models.BaseSQLRecord.filter` to query all artifacts by the `suffix` field: ```python qs = ln.Artifact.filter(suffix=".h5ad") qs ``` This returns a {class}`~lamindb.models.QuerySet`, which lazily references the set of {class}`~lamindb.models.BaseSQLRecord` objects that matches the filter statement. You can iteratively filter a queryset: ```python qs = qs.filter(records__name="Experiment 1") ``` To access the results encoded in a queryset, call: - {meth}`~lamindb.models.BasicQuerySet.to_dataframe`: A pandas `DataFrame` with each record in a row. - {meth}`~lamindb.models.BasicQuerySet.one`: Exactly one record. Will raise an error if there is none. Is equivalent to the `.get()` method shown above. - {meth}`~lamindb.models.BasicQuerySet.one_or_none`: Either one record or `None` if there is no query result. 
Alternatively, - use the `QuerySet` as an iterator - get individual objects via `qs[0]`, `qs[1]` For example: ```python qs.to_dataframe() ``` Note that the `SQLRecord` classes in LaminDB are Django models and any [Django query](https://docs.djangoproject.com/en/stable/topics/db/queries/) works. ## Query objects by features The `Artifact`, `Record`, and `Run` registries can be queried by features. ```python ln.Artifact.filter(perturbation="DMSO").to_dataframe(include="features") ``` You can also query by passing a `Feature` object, which is useful to disambiguate feature names. ```python perturbation = ln.Feature.get(name="perturbation") # can optionally pass a feature type to disambiguate ln.Artifact.filter(perturbation == "DMSO") # note this is now an expression using the == syntax ``` Just like for fields holding dictionary values, you can query for dictionary keys in features whose `dtype` is `dict`: ```python ln.Artifact.filter(study_metadata__detail1="123").to_dataframe(include="features") ``` ```python ln.Artifact.filter(study_metadata__detail2=2).to_dataframe(include="features") ``` You can query for whether a dataset is annotated or not annotated by a feature: ```python ln.Artifact.filter(perturbation__isnull=True).to_dataframe(include="features") ``` ```python ln.Artifact.filter(perturbation__isnull=False).to_dataframe(include="features") ``` ## Query runs by parameters Here is an example for querying by parameters: {ref}`track-run-parameters`. ## Search for objects You can search every registry via {meth}`~lamindb.models.SQLRecord.search`. For example, the `Artifact` registry: ```python ln.Artifact.search("iris").to_dataframe() ``` Here is more background on search and examples for searching the entire cell type ontology: {doc}`/faq/search` ## Query related registries Django has a double-underscore syntax to filter based on related tables. This syntax enables you to traverse several layers of relations and leverage different comparators. ```python ln.Artifact.filter(created_by__handle__startswith="testuse").to_dataframe() ``` The filter selects all artifacts based on the users who ran the generating notebook. Under the hood, in the SQL database, it's joining the artifact table with the user table. Another typical example is querying all datasets that measure a particular feature, for instance `"CD8A"`. Here is how to do it: ```python cd8a = bt.Gene.get(symbol="CD8A") # query for all feature sets that contain CD8A schemas_with_cd8a = ln.Schema.filter(genes=cd8a) # get all artifacts ln.Artifact.filter(schemas__in=schemas_with_cd8a).to_dataframe() ``` Instead of splitting this across three queries, the double-underscore syntax allows you to define a path for one query. ```python ln.Artifact.filter(schemas__genes__symbol="CD8A").to_dataframe() ``` ## Filter operators You can qualify the type of comparison in a query by using a comparator. Below follows a list of the most important ones, but Django supports about [two dozen field comparators](https://docs.djangoproject.com/en/stable/ref/models/querysets/#field-lookups) of the form `field__comparator=value`. ### and ```python ln.Artifact.filter(suffix=".h5ad", records=experiment_1).to_dataframe() ``` ### less than/ greater than Subset to artifacts greater than 10kB. Here, plain keyword equality isn't enough; we qualify the field with the `__gt` comparator.
```python ln.Artifact.filter(records=experiment_1, size__gt=1e4).to_dataframe() ``` ### in ```python ln.Artifact.filter(suffix__in=[".jpg", ".fastq.gz"]).to_dataframe() ``` ### order by ```python ln.Artifact.filter().order_by("created_at").to_dataframe() ``` ```python # reverse ordering ln.Artifact.filter().order_by("-created_at").to_dataframe() ``` ```python ln.Artifact.filter().order_by("key").to_dataframe() ``` ```python # reverse ordering ln.Artifact.filter().order_by("-key").to_dataframe() ``` ### contains ```python ln.Transform.filter(description__contains="search").to_dataframe().head(5) ``` And case-insensitive: ```python ln.Transform.filter(description__icontains="Search").to_dataframe().head(5) ``` ### startswith ```python ln.Transform.filter(description__startswith="Query").to_dataframe() ``` ### or ```python ln.Artifact.filter(ln.Q(suffix=".jpg") | ln.Q(suffix=".fastq.gz")).to_dataframe() ``` ### negate/ unequal ```python ln.Artifact.filter(~ln.Q(suffix=".jpg")).to_dataframe() ``` ================================================ FILE: docs/scripts/curate_anndata_flexible.py ================================================ import lamindb as ln ln.examples.datasets.mini_immuno.define_features_labels() adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") artifact = ln.Artifact.from_anndata( adata, key="examples/mini_immuno.h5ad", schema="ensembl_gene_ids_and_valid_features_in_obs", ).save() artifact.describe() ================================================ FILE: docs/scripts/curate_anndata_uns.py ================================================ import lamindb as ln ln.examples.datasets.mini_immuno.define_features_labels() adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") schema = ln.Schema.get(name="Study metadata schema") artifact = ln.Artifact.from_anndata( adata, schema=schema, key="examples/mini_immuno_uns.h5ad" ) artifact.describe() ================================================ FILE: docs/scripts/curate_dataframe_attrs.py ================================================ import lamindb as ln from .define_schema_df_metadata import study_metadata_schema df = ln.examples.datasets.mini_immuno.get_dataset1(otype="DataFrame") schema = ln.Schema( features=[ln.Feature(name="perturbation", dtype="str").save()], slots={"attrs": study_metadata_schema}, otype="DataFrame", ).save() curator = ln.curators.DataFrameCurator(df, schema=schema) curator.validate() artifact = curator.save_artifact(key="examples/df_with_attrs.parquet") artifact.describe() ================================================ FILE: docs/scripts/curate_dataframe_external_features.py ================================================ import lamindb as ln from datetime import date df = ln.examples.datasets.mini_immuno.get_dataset1(otype="DataFrame") temperature = ln.Feature(name="temperature", dtype=float).save() date_of_study = ln.Feature(name="date_of_study", dtype=date).save() external_schema = ln.Schema(features=[temperature, date_of_study]).save() concentration = ln.Feature(name="concentration", dtype=str).save() donor = ln.Feature(name="donor", dtype=str, nullable=True).save() schema = ln.Schema( features=[concentration, donor], slots={"__external__": external_schema}, otype="DataFrame", ).save() artifact = ln.Artifact.from_dataframe( df, key="examples/dataset1.parquet", features={"temperature": 21.6, "date_of_study": date(2024, 10, 1)}, schema=schema, ).save() artifact.describe() ================================================ FILE: docs/scripts/curate_dataframe_flexible.py 
================================================ import lamindb as ln ln.examples.datasets.mini_immuno.define_features_labels() df = ln.examples.datasets.mini_immuno.get_dataset1(otype="DataFrame") artifact = ln.Artifact.from_dataframe( df, key="examples/dataset1.parquet", schema="valid_features" ).save() artifact.describe() ================================================ FILE: docs/scripts/curate_dataframe_minimal_errors.py ================================================ import lamindb as ln schema = ln.examples.datasets.mini_immuno.define_mini_immuno_schema_flexible() df = ln.examples.datasets.mini_immuno.get_dataset1(otype="DataFrame") df.pop("donor") # remove donor column to trigger validation error try: artifact = ln.Artifact.from_dataframe( df, key="examples/dataset1.parquet", schema=schema ).save() except ln.errors.ValidationError as error: print(error) ================================================ FILE: docs/scripts/curate_dataframe_union_features.py ================================================ import lamindb as ln import pandas as pd union_feature = ln.Feature( name="mixed_feature", dtype="cat[bionty.Tissue.ontology_id|bionty.CellType.ontology_id]", ).save() df_mixed = pd.DataFrame({"mixed_feature": ["UBERON:0000178", "CL:0000540"]}) schema = ln.Schema(features=[union_feature], coerce=True).save() curator = ln.curators.DataFrameCurator(df_mixed, schema) curator.validate() ================================================ FILE: docs/scripts/curate_mudata.py ================================================ import lamindb as ln import bionty as bt from docs.scripts.define_schema_df_metadata import study_metadata_schema # define labels perturbation = ln.Record(name="Perturbation", is_type=True).save() ln.Record(name="Perturbed", type=perturbation).save() ln.Record(name="NT", type=perturbation).save() replicate = ln.Record(name="Replicate", is_type=True).save() ln.Record(name="rep1", type=replicate).save() ln.Record(name="rep2", type=replicate).save() ln.Record(name="rep3", type=replicate).save() # define the global obs schema obs_schema = ln.Schema( name="mudata_papalexi21_subset_obs_schema", features=[ ln.Feature(name="perturbation", dtype="cat[Record[Perturbation]]").save(), ln.Feature(name="replicate", dtype="cat[Record[Replicate]]").save(), ], ).save() # define the ['rna'].obs schema obs_schema_rna = ln.Schema( name="mudata_papalexi21_subset_rna_obs_schema", features=[ ln.Feature(name="nCount_RNA", dtype=int).save(), ln.Feature(name="nFeature_RNA", dtype=int).save(), ln.Feature(name="percent.mito", dtype=float).save(), ], ).save() # define the ['hto'].obs schema obs_schema_hto = ln.Schema( name="mudata_papalexi21_subset_hto_obs_schema", features=[ ln.Feature(name="nCount_HTO", dtype=float).save(), ln.Feature(name="nFeature_HTO", dtype=int).save(), ln.Feature(name="technique", dtype=bt.ExperimentalFactor).save(), ], ).save() # define ['rna'].var schema var_schema_rna = ln.Schema( name="mudata_papalexi21_subset_rna_var_schema", itype=bt.Gene.symbol, dtype=float, ).save() # define composite schema mudata_schema = ln.Schema( name="mudata_papalexi21_subset_mudata_schema", otype="MuData", slots={ "obs": obs_schema, "rna:obs": obs_schema_rna, "hto:obs": obs_schema_hto, "rna:var": var_schema_rna, "uns:study_metadata": study_metadata_schema, }, ).save() # curate a MuData mdata = ln.examples.datasets.mudata_papalexi21_subset(with_uns=True) bt.settings.organism = "human" # set the organism to map gene symbols curator = ln.curators.MuDataCurator(mdata, mudata_schema) artifact = 
curator.save_artifact(key="examples/mudata_papalexi21_subset.h5mu") assert artifact.schema == mudata_schema ================================================ FILE: docs/scripts/curate_soma_experiment.py ================================================ import lamindb as ln import bionty as bt import tiledbsoma as soma import tiledbsoma.io adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") tiledbsoma.io.from_anndata("small_dataset.tiledbsoma", adata, measurement_name="RNA") obs_schema = ln.Schema( name="soma_obs_schema", features=[ ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(), ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(), ], ).save() var_schema = ln.Schema( name="soma_var_schema", features=[ ln.Feature(name="var_id", dtype=bt.Gene.ensembl_gene_id).save(), ], coerce=True, ).save() soma_schema = ln.Schema( name="soma_experiment_schema", otype="tiledbsoma", slots={ "obs": obs_schema, "ms:RNA.T": var_schema, }, ).save() with soma.Experiment.open("small_dataset.tiledbsoma") as experiment: curator = ln.curators.TiledbsomaExperimentCurator(experiment, soma_schema) curator.validate() artifact = curator.save_artifact( key="examples/soma_experiment.tiledbsoma", description="SOMA experiment with schema validation", ) assert artifact.schema == soma_schema artifact.describe() ================================================ FILE: docs/scripts/curate_spatialdata.py ================================================ import lamindb as ln spatialdata = ln.examples.datasets.spatialdata_blobs() sdata_schema = ln.Schema.get(name="spatialdata_blobs_schema") curator = ln.curators.SpatialDataCurator(spatialdata, sdata_schema) try: curator.validate() except ln.errors.ValidationError: pass spatialdata.tables["table"].var.drop(index="ENSG00000999999", inplace=True) # validate again (must pass now) and save artifact artifact = ln.Artifact.from_spatialdata( spatialdata, key="examples/spatialdata1.zarr", schema=sdata_schema ).save() artifact.describe() ================================================ FILE: docs/scripts/define_schema_anndata_uns.py ================================================ import lamindb as ln from define_schema_df_metadata import study_metadata_schema anndata_uns_schema = ln.Schema( otype="AnnData", slots={ "uns:study_metadata": study_metadata_schema, }, ).save() ================================================ FILE: docs/scripts/define_schema_df_metadata.py ================================================ import lamindb as ln study_metadata_schema = ln.Schema( name="Study metadata schema", features=[ ln.Feature(name="temperature", dtype=float).save(), ln.Feature(name="experiment", dtype=str).save(), ], ).save() ================================================ FILE: docs/scripts/define_schema_spatialdata.py ================================================ import lamindb as ln import bionty as bt # a very comprehensive schema for different slots of a SpatialData object # define or query features bio_dict = ln.Feature(name="bio", dtype=dict).save() tech_dict = ln.Feature(name="tech", dtype=dict).save() disease = ln.Feature(name="disease", dtype=bt.Disease, coerce=True).save() developmental_stage = ln.Feature( name="developmental_stage", dtype=bt.DevelopmentalStage, coerce=True, ).save() assay = ln.Feature(name="assay", dtype=bt.ExperimentalFactor, coerce=True).save() sample_region = ln.Feature(name="sample_region", dtype=str).save() analysis = ln.Feature(name="analysis", dtype=str).save() # define or query schema components attrs_schema = 
ln.Schema([bio_dict, tech_dict]).save() sample_schema = ln.Schema([disease, developmental_stage]).save() tech_schema = ln.Schema([assay]).save() obs_schema = ln.Schema([sample_region]).save() uns_schema = ln.Schema([analysis]).save() # enforces only registered Ensembl Gene IDs pass validation (maximal_set=True) varT_schema = ln.Schema(itype=bt.Gene.ensembl_gene_id, maximal_set=True).save() # compose the SpatialData schema sdata_schema = ln.Schema( name="spatialdata_blobs_schema", otype="SpatialData", slots={ "attrs:bio": sample_schema, "attrs:tech": tech_schema, "attrs": attrs_schema, "tables:table:obs": obs_schema, "tables:table:var.T": varT_schema, }, ).save() ================================================ FILE: docs/scripts/my_workflow.py ================================================ import lamindb as ln @ln.flow() def ingest_dataset(key: str) -> ln.Artifact: df = ln.examples.datasets.mini_immuno.get_dataset1() artifact = ln.Artifact.from_dataframe(df, key=key).save() return artifact if __name__ == "__main__": ingest_dataset(key="my_analysis/dataset.parquet") ================================================ FILE: docs/scripts/my_workflow_with_click.py ================================================ import click import lamindb as ln @click.command() @click.option("--key", required=True) @ln.flow() def main(key: str): df = ln.examples.datasets.mini_immuno.get_dataset2() ln.Artifact.from_dataframe(df, key=key).save() if __name__ == "__main__": main() ================================================ FILE: docs/scripts/my_workflow_with_step.py ================================================ import lamindb as ln @ln.step() def subset_dataframe( artifact: ln.Artifact, subset_rows: int = 2, subset_cols: int = 2, ) -> ln.Artifact: df = artifact.load() new_data = df.iloc[:subset_rows, :subset_cols] new_key = artifact.key.replace(".parquet", "_subsetted.parquet") return ln.Artifact.from_dataframe(new_data, key=new_key).save() @ln.flow() def ingest_dataset(key: str, subset: bool = False) -> ln.Artifact: df = ln.examples.datasets.mini_immuno.get_dataset1() artifact = ln.Artifact.from_dataframe(df, key=key).save() if subset: artifact = subset_dataframe(artifact) return artifact if __name__ == "__main__": ingest_dataset(key="my_analysis/dataset.parquet", subset=True) ================================================ FILE: docs/scripts/run_script_with_step.py ================================================ import argparse import lamindb as ln @ln.step() def subset_dataframe( artifact: ln.Artifact, subset_rows: int = 2, subset_cols: int = 2, run: ln.Run | None = None, ) -> ln.Artifact: dataset = artifact.load(is_run_input=run) new_data = dataset.iloc[:subset_rows, :subset_cols] new_key = artifact.key.replace(".parquet", "_subsetted.parquet") return ln.Artifact.from_dataframe(new_data, key=new_key, run=run).save() if __name__ == "__main__": p = argparse.ArgumentParser() p.add_argument("--subset", action="store_true") args = p.parse_args() params = {"is_subset": args.subset} ln.track(params=params) if args.subset: df = ln.examples.datasets.mini_immuno.get_dataset1(otype="DataFrame") artifact = ln.Artifact.from_dataframe( df, key="my_analysis/dataset.parquet" ).save() subsetted_artifact = subset_dataframe(artifact) ln.finish() ================================================ FILE: docs/scripts/run_track_and_finish.py ================================================ import lamindb as ln ln.track() # initiate a tracked notebook/script run # your code automatically tracks inputs & outputs ln.finish() # 
mark run as finished, save execution report, source code & environment ================================================ FILE: docs/scripts/run_track_with_features_and_params.py ================================================ import argparse import lamindb as ln if __name__ == "__main__": p = argparse.ArgumentParser() p.add_argument("--s3-folder", type=str) p.add_argument("--experiment", type=str) args = p.parse_args() features = { "s3_folder": args.s3_folder, "experiment": args.experiment, } ln.track(features=features, params={"example_param": 42}) # your code ln.finish() ================================================ FILE: docs/scripts/run_track_with_params.py ================================================ import argparse import lamindb as ln if __name__ == "__main__": p = argparse.ArgumentParser() p.add_argument("--input-dir", type=str) p.add_argument("--downsample", action="store_true") p.add_argument("--learning-rate", type=float) args = p.parse_args() params = { "input_dir": args.input_dir, "learning_rate": args.learning_rate, "preprocess_params": { "downsample": args.downsample, "normalization": "the_good_one", }, } ln.track(params=params) # your code ln.finish() ================================================ FILE: docs/scripts/synced_with_git.py ================================================ import lamindb as ln ln.settings.sync_git_repo = "https://github.com/..." ln.track() # your code ln.finish() ================================================ FILE: docs/storage/add-replace-cache.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "0", "metadata": {}, "source": [ "# Add, replace, cache and delete artifacts" ] }, { "cell_type": "code", "execution_count": null, "id": "1", "metadata": {}, "outputs": [], "source": [ "import pytest\n", "import shutil\n", "import lamindb as ln" ] }, { "cell_type": "code", "execution_count": null, "id": "2", "metadata": {}, "outputs": [], "source": [ "ln.setup.login(\"testuser1\")" ] }, { "cell_type": "code", "execution_count": null, "id": "3", "metadata": { "tags": [ "hide-output", "hide-cell" ] }, "outputs": [], "source": [ "try:\n", " root_path = ln.UPath(\"s3://lamindb-ci/test-add-replace-cache\")\n", " if root_path.exists():\n", " root_path.rmdir()\n", " ln.setup.delete(\"testuser1/test-add-replace-cache\", force=True)\n", "except BaseException: # noqa: S110\n", " pass" ] }, { "cell_type": "code", "execution_count": null, "id": "4", "metadata": {}, "outputs": [], "source": [ "ln.setup.init(storage=\"s3://lamindb-ci/test-add-replace-cache\")" ] }, { "cell_type": "markdown", "id": "5", "metadata": {}, "source": [ "## Save with auto-managed (`key=None`)" ] }, { "cell_type": "code", "execution_count": null, "id": "6", "metadata": {}, "outputs": [], "source": [ "AUTO_KEY_PREFIX = ln.core.storage.paths.AUTO_KEY_PREFIX\n", "root = ln.settings.storage.root" ] }, { "cell_type": "code", "execution_count": null, "id": "7", "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact(\"./test-files/iris.csv\", description=\"iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "8", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "9", "metadata": {}, "outputs": [], "source": [ "key_path = root / f\"{AUTO_KEY_PREFIX}{artifact.uid}{artifact.suffix}\"\n", "assert key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "10", "metadata": {}, "outputs": [], "source": [ "cache_csv_path = 
artifact.cache()\n", "print(cache_csv_path)\n", "assert cache_csv_path.suffix == \".csv\"" ] }, { "cell_type": "code", "execution_count": null, "id": "11", "metadata": {}, "outputs": [], "source": [ "artifact.replace(\"./test-files/iris.data\")\n", "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "12", "metadata": {}, "outputs": [], "source": [ "old_key_path = key_path\n", "new_key_path = root / f\"{AUTO_KEY_PREFIX}{artifact.uid}{artifact.suffix}\"" ] }, { "cell_type": "markdown", "id": "13", "metadata": {}, "source": [ "The suffix changed:" ] }, { "cell_type": "code", "execution_count": null, "id": "14", "metadata": {}, "outputs": [], "source": [ "print(old_key_path)\n", "print(new_key_path)\n", "assert not old_key_path.exists()\n", "assert new_key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "15", "metadata": {}, "outputs": [], "source": [ "cache_data_path = artifact.cache()\n", "print(cache_data_path)\n", "assert cache_data_path.suffix == \".data\"\n", "assert cache_data_path.stat().st_mtime >= cache_csv_path.stat().st_mtime" ] }, { "cell_type": "code", "execution_count": null, "id": "16", "metadata": {}, "outputs": [], "source": [ "artifact.delete(permanent=True)" ] }, { "cell_type": "markdown", "id": "17", "metadata": {}, "source": [ "## Save with manually passed real `key`" ] }, { "cell_type": "code", "execution_count": null, "id": "18", "metadata": {}, "outputs": [], "source": [ "ln.settings.creation._artifact_use_virtual_keys = False" ] }, { "cell_type": "code", "execution_count": null, "id": "19", "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact(\"./test-files/iris.csv\", key=\"iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "20", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "21", "metadata": {}, "outputs": [], "source": [ "key_path = root / \"iris.csv\"" ] }, { "cell_type": "code", "execution_count": null, "id": "22", "metadata": {}, "outputs": [], "source": [ "assert key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "23", "metadata": {}, "outputs": [], "source": [ "artifact.replace(\"./test-files/new_iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "24", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "markdown", "id": "25", "metadata": {}, "source": [ "Check paths: no changes here, as the suffix didn't change." 
] }, { "cell_type": "code", "execution_count": null, "id": "26", "metadata": {}, "outputs": [], "source": [ "old_key_path = key_path\n", "new_key_path = root / \"new_iris.csv\"" ] }, { "cell_type": "code", "execution_count": null, "id": "27", "metadata": {}, "outputs": [], "source": [ "old_key_path" ] }, { "cell_type": "code", "execution_count": null, "id": "28", "metadata": {}, "outputs": [], "source": [ "new_key_path" ] }, { "cell_type": "code", "execution_count": null, "id": "29", "metadata": {}, "outputs": [], "source": [ "assert old_key_path.exists()\n", "assert not new_key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "30", "metadata": {}, "outputs": [], "source": [ "artifact.replace(\"./test-files/iris.data\")" ] }, { "cell_type": "code", "execution_count": null, "id": "31", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "32", "metadata": {}, "outputs": [], "source": [ "new_key_path = root / \"iris.data\"" ] }, { "cell_type": "code", "execution_count": null, "id": "33", "metadata": {}, "outputs": [], "source": [ "old_key_path" ] }, { "cell_type": "code", "execution_count": null, "id": "34", "metadata": {}, "outputs": [], "source": [ "new_key_path" ] }, { "cell_type": "code", "execution_count": null, "id": "35", "metadata": {}, "outputs": [], "source": [ "assert not old_key_path.exists()\n", "assert new_key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "36", "metadata": {}, "outputs": [], "source": [ "artifact.delete(permanent=True, storage=True)" ] }, { "cell_type": "markdown", "id": "37", "metadata": {}, "source": [ "## Save from memory" ] }, { "cell_type": "code", "execution_count": null, "id": "38", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "id": "39", "metadata": {}, "outputs": [], "source": [ "iris = pd.read_csv(\"./test-files/iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "40", "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact.from_dataframe(\n", " iris, description=\"iris_store\", key=\"iris.parquet\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "41", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "42", "metadata": {}, "outputs": [], "source": [ "key_path = root / \"iris.parquet\"" ] }, { "cell_type": "code", "execution_count": null, "id": "43", "metadata": {}, "outputs": [], "source": [ "assert key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "44", "metadata": {}, "outputs": [], "source": [ "artifact.replace(data=iris[:-1])" ] }, { "cell_type": "code", "execution_count": null, "id": "45", "metadata": {}, "outputs": [], "source": [ "assert artifact.key == \"iris.parquet\"" ] }, { "cell_type": "code", "execution_count": null, "id": "46", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "47", "metadata": {}, "outputs": [], "source": [ "assert key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "48", "metadata": {}, "outputs": [], "source": [ "artifact.replace(\"./test-files/new_iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "49", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "50", "metadata": {}, "outputs": [], "source": 
[ "old_key_path = key_path\n", "new_key_path = root / \"iris.csv\"" ] }, { "cell_type": "code", "execution_count": null, "id": "51", "metadata": {}, "outputs": [], "source": [ "old_key_path" ] }, { "cell_type": "code", "execution_count": null, "id": "52", "metadata": {}, "outputs": [], "source": [ "new_key_path" ] }, { "cell_type": "code", "execution_count": null, "id": "53", "metadata": {}, "outputs": [], "source": [ "assert not old_key_path.exists()\n", "assert new_key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "54", "metadata": {}, "outputs": [], "source": [ "# we use the path in the next sections\n", "path_in_storage = artifact.path\n", "artifact.delete(permanent=True, storage=False)" ] }, { "cell_type": "markdown", "id": "55", "metadata": {}, "source": [ "## Save with manually passed virtual `key`" ] }, { "cell_type": "code", "execution_count": null, "id": "56", "metadata": {}, "outputs": [], "source": [ "ln.settings.creation._artifact_use_virtual_keys = True" ] }, { "cell_type": "code", "execution_count": null, "id": "57", "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact(\"./test-files/iris.csv\", key=\"iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "58", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "59", "metadata": {}, "outputs": [], "source": [ "with pytest.raises(ValueError) as err:\n", " artifact.replace(path_in_storage)\n", "assert err.exconly().startswith(\n", " \"ValueError: Can only replace with a local path not in any Storage.\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "60", "metadata": {}, "outputs": [], "source": [ "# return an existing artifact if the hash is the same\n", "assert artifact == artifact.replace(\"./test-files/iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "61", "metadata": {}, "outputs": [], "source": [ "fpath = artifact.path\n", "assert fpath.suffix == \".csv\" and fpath.stem == artifact.uid" ] }, { "cell_type": "code", "execution_count": null, "id": "62", "metadata": {}, "outputs": [], "source": [ "artifact.replace(\"./test-files/iris.data\")" ] }, { "cell_type": "code", "execution_count": null, "id": "63", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "64", "metadata": {}, "outputs": [], "source": [ "assert artifact.key == \"iris.data\"" ] }, { "cell_type": "code", "execution_count": null, "id": "65", "metadata": {}, "outputs": [], "source": [ "assert not fpath.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "66", "metadata": {}, "outputs": [], "source": [ "fpath = artifact.path\n", "assert fpath.suffix == \".data\" and fpath.stem == artifact.uid" ] }, { "cell_type": "code", "execution_count": null, "id": "67", "metadata": {}, "outputs": [], "source": [ "artifact.delete(permanent=True, storage=True)" ] }, { "cell_type": "markdown", "id": "68", "metadata": {}, "source": [ "## Save in existing storage with a virtual `key`" ] }, { "cell_type": "code", "execution_count": null, "id": "69", "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact(path_in_storage, key=\"iris_test.csv\").save()" ] }, { "cell_type": "code", "execution_count": null, "id": "70", "metadata": {}, "outputs": [], "source": [ "assert artifact._real_key.endswith(\"iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "71", "metadata": {}, "outputs": [], "source": [ 
"artifact.replace(\"./test-files/iris.data\")" ] }, { "cell_type": "code", "execution_count": null, "id": "72", "metadata": {}, "outputs": [], "source": [ "assert artifact._real_key.endswith(\"iris.data\")\n", "assert artifact._clear_storagekey.endswith(\"iris.csv\")\n", "assert artifact.key == \"iris_test.data\"" ] }, { "cell_type": "code", "execution_count": null, "id": "73", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "74", "metadata": {}, "outputs": [], "source": [ "path = artifact.path\n", "\n", "assert path.name == \"iris.data\"\n", "assert path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "75", "metadata": {}, "outputs": [], "source": [ "assert not path_in_storage.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "76", "metadata": {}, "outputs": [], "source": [ "artifact.delete(permanent=True, storage=True)" ] }, { "cell_type": "markdown", "id": "77", "metadata": {}, "source": [ "## Replace with folder artifacts" ] }, { "cell_type": "code", "execution_count": null, "id": "78", "metadata": {}, "outputs": [], "source": [ "adata = ln.examples.datasets.anndata_pbmc68k_reduced()\n", "\n", "adata.write_zarr(\"./test-files/pbmc68k.zarr\")" ] }, { "cell_type": "code", "execution_count": null, "id": "79", "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact(\"./test-files/pbmc68k.zarr\", key=\"pbmc68k.zarr\").save()\n", "save_hash = artifact.hash\n", "save_n_files = artifact.n_files" ] }, { "cell_type": "code", "execution_count": null, "id": "80", "metadata": {}, "outputs": [], "source": [ "with pytest.raises(ValueError) as err:\n", " artifact.replace(\"./test-files/iris.csv\")\n", "assert err.exconly().endswith(\"It is not allowed to replace a folder with a file.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "81", "metadata": {}, "outputs": [], "source": [ "assert save_hash is not None\n", "assert artifact.path.is_dir()" ] }, { "cell_type": "code", "execution_count": null, "id": "82", "metadata": {}, "outputs": [], "source": [ "adata.obs[\"add_new_col\"] = \"new\"\n", "\n", "adata.write_zarr(\"./test-files/pbmc68k_new.zarr\")" ] }, { "cell_type": "code", "execution_count": null, "id": "83", "metadata": {}, "outputs": [], "source": [ "artifact.replace(\"./test-files/pbmc68k_new.zarr\")\n", "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "84", "metadata": {}, "outputs": [], "source": [ "assert artifact.key == \"pbmc68k.zarr\"\n", "assert artifact.hash != save_hash\n", "assert artifact.n_files != save_n_files\n", "assert artifact.path.is_dir()" ] }, { "cell_type": "code", "execution_count": null, "id": "85", "metadata": {}, "outputs": [], "source": [ "shutil.rmtree(artifact.cache())" ] }, { "cell_type": "code", "execution_count": null, "id": "86", "metadata": {}, "outputs": [], "source": [ "with artifact.open() as store:\n", " assert \"add_new_col\" in store.obs" ] }, { "cell_type": "code", "execution_count": null, "id": "87", "metadata": {}, "outputs": [], "source": [ "# checks that .open above opened the cloud path without syncing\n", "assert not artifact._cache_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "88", "metadata": {}, "outputs": [], "source": [ "shutil.rmtree(\"./test-files/pbmc68k.zarr\")\n", "shutil.rmtree(\"./test-files/pbmc68k_new.zarr\")" ] }, { "cell_type": "code", "execution_count": null, "id": "89", "metadata": {}, "outputs": [], "source": [ 
"artifact.delete(permanent=True, storage=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "90", "metadata": { "tags": [] }, "outputs": [], "source": [ "ln.setup.delete(\"test-add-replace-cache\", force=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" }, "nbproject": { "id": "uBQMCcdYwEjA", "parent": null, "pypackage": null, "time_init": "2023-04-04T16:26:17.675023+00:00", "user_handle": "Koncopd", "user_id": "qTQ5q0ar", "user_name": "Sergei Rybakov", "version": "0" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: docs/storage/anndata-accessor.ipynb ================================================ { "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Test `AnnDataAccessor`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import lamindb as ln\n", "\n", "ln.setup.init(storage=\"s3://lamindb-ci/test-anndata\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We'll need some test data:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [] }, "outputs": [], "source": [ "ln.Artifact(\"s3://lamindb-ci/test-anndata/pbmc68k.h5ad\").save()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "An `h5ad` artifact stored on s3:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact.filter(key=\"pbmc68k.h5ad\").one()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "artifact.path" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata = artifact.open()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "It is possible to access `AnnData` attributes without loading them into memory" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "hide-cell" ] }, "outputs": [], "source": [ "print(adata.obsm)\n", "print(adata.varm)\n", "print(adata.obsp)\n", "print(adata.varm)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "However, `.obs`, `.var` and `.uns` are always loaded fully into memory on `AnnDataAccessor` initialization" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata.obs.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata.var.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata.uns.keys()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Without subsetting, the `AnnDataAccessor` object gives references to underlying lazy `h5` or `zarr` arrays:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata.X" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata.obsm[\"X_pca\"]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "And to a lazy 
`SparseDataset` from the `anndata` package:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata.obsp[\"distances\"]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Get a subset of the object, attributes are loaded only on explicit access:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "obs_idx = adata.obs.cell_type.isin([\"Dendritic cells\", \"CD14+ Monocytes\"]) & (\n", " adata.obs.percent_mito <= 0.05\n", ")\n", "adata_subset = adata[obs_idx]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata_subset" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Check shapes of the subset" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "hide-cell" ] }, "outputs": [], "source": [ "num_idx = sum(obs_idx)\n", "assert adata_subset.shape == (num_idx, adata.shape[1])\n", "assert (adata_subset.obs.cell_type == \"CD34+\").sum() == 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata_subset.obs.cell_type.value_counts()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Subsets load the arrays into memory only on direct access" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(adata_subset.X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(adata_subset.obsm[\"X_pca\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "hide-cell" ] }, "outputs": [], "source": [ "assert adata_subset.obsp[\"distances\"].shape[0] == num_idx" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "To load the entire subset into memory as an actual `AnnData` object, use `to_memory()`:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata_subset.to_memory()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "hide-cell" ] }, "outputs": [], "source": [ "!lamin delete --force test-anndata" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" }, "nbproject": { "id": "YVUCtH4GfQOy", "parent": null, "pypackage": null, "time_init": "2023-01-23T08:28:32.097943+00:00", "user_handle": "testuser1", "user_id": "DzTjkKse", "user_name": "Test User1", "version": "0" }, "vscode": { "interpreter": { "hash": "ae1fefc8646a06dd2e75004cd934adda7c5727b046986a772e3b44b0ffba9754" } } }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: docs/storage/prepare-sync-local-to-cloud.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Prepare sync artifacts from a local instance to a cloud instance" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!lamin disconnect\n", "# need to add pertdb to environment in order to import it\n", "!lamin settings modules set bionty,pertdb" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": 
[], "source": [ "import lamindb as ln\n", "import bionty as bt\n", "import pertdb\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ln.setup.init(storage=\"./test-sync-to-cloud\", modules=\"bionty,pertdb\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact.from_dataframe(\n", " pd.DataFrame({\"a\": [1, 2, 3]}), description=\"test-sync-to-cloud\"\n", ").save()\n", "features = bt.CellMarker.from_values(\n", " [\"PD1\", \"CD21\"], field=bt.CellMarker.name, organism=\"human\"\n", ").save()\n", "artifact.features._add_schema(ln.Schema(features), slot=\"var\")\n", "organism = bt.Organism.from_source(name=\"human\").save()\n", "artifact.labels.add(organism)\n", "compound = pertdb.Compound(name=\"compound-test-sync-to-cloud\").save()\n", "artifact.compounds.add(compound)\n", "artifact.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "assert artifact.features.slots[\"var\"].members.count() == 2" ] } ], "metadata": { "kernelspec": { "display_name": "py312", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: docs/storage/sync-local-to-cloud.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Sync artifacts from a local instance to a cloud instance" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import lamindb as ln\n", "\n", "ln.connect(\"laminlabs/lamin-dev\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "hide-cell" ] }, "outputs": [], "source": [ "def cleanup(artifact: ln.Artifact):\n", " features_sets = artifact.schemas.all()\n", " compounds = artifact.compounds.all()\n", " artifact.delete(permanent=True, storage=False)\n", " features_sets.delete()\n", " compounds.delete()\n", "\n", "\n", "artifacts = ln.Artifact.filter(description=\"test-sync-to-cloud\")\n", "for artifact in artifacts:\n", " cleanup(artifact)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact.connect(\"testuser1/test-sync-to-cloud\").get(\n", " description=\"test-sync-to-cloud\"\n", ")\n", "artifact.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "artifact.save(transfer=\"annotations\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "artifact.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "assert artifact._state.db == \"default\"\n", "assert artifact.organisms.get().name == \"human\"\n", "assert artifact.compounds.get().name == \"compound-test-sync-to-cloud\"\n", "assert artifact.features.slots[\"var\"].members.count() == 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "hide-cell" ] }, "outputs": [], "source": [ "!rm -r ./test-sync-to-cloud\n", "!lamin delete --force test-sync-to-cloud" ] } ], "metadata": { "kernelspec": { "display_name": "py312", "language": "python", "name": "python3" }, 
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: docs/storage/test-files/iris.data ================================================ 5.1,3.5,1.4,0.2,Iris-setosa 4.9,3.0,1.4,0.2,Iris-setosa 7.0,3.2,4.7,1.4,Iris-versicolor 6.4,3.2,4.5,1.5,Iris-versicolor 6.2,3.4,5.4,2.3,Iris-virginica 5.9,3.0,5.1,1.8,Iris-virginica ================================================ FILE: docs/storage/test_notebooks.py ================================================ from pathlib import Path import nbproject_test as test import lamindb as ln def test_notebooks(): nbdir = Path(__file__).parent ln.setup.login("testuser1") test.execute_notebooks(nbdir, write=True) ================================================ FILE: docs/storage/upload.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "0", "metadata": {}, "source": [ "# Track artifacts, in-memory objects & folders [S3 storage]" ] }, { "cell_type": "code", "execution_count": null, "id": "1", "metadata": {}, "outputs": [], "source": [ "import lamindb as ln\n", "import pytest" ] }, { "cell_type": "code", "execution_count": null, "id": "2", "metadata": {}, "outputs": [], "source": [ "ln.setup.login(\"testuser1\")" ] }, { "cell_type": "code", "execution_count": null, "id": "3", "metadata": { "tags": [ "hide-output", "hide-cell" ] }, "outputs": [], "source": [ "try:\n", " root_path = ln.UPath(\"s3://lamindb-ci/test-upload\")\n", " if root_path.exists():\n", " root_path.rmdir()\n", " ln.setup.delete(\"testuser1/test-upload\", force=True)\n", "except BaseException: # noqa: S110\n", " pass" ] }, { "cell_type": "code", "execution_count": null, "id": "4", "metadata": {}, "outputs": [], "source": [ "ln.setup.init(storage=\"s3://lamindb-ci/test-upload\")" ] }, { "cell_type": "markdown", "id": "5", "metadata": {}, "source": [ "## Local artifacts" ] }, { "cell_type": "markdown", "id": "6", "metadata": {}, "source": [ "Some test data." 
] }, { "cell_type": "code", "execution_count": null, "id": "7", "metadata": {}, "outputs": [], "source": [ "pbmc68k = ln.examples.datasets.anndata_pbmc68k_reduced()" ] }, { "cell_type": "markdown", "id": "8", "metadata": {}, "source": [ "Subset to a mini artifact to speed up the run time of this notebook:" ] }, { "cell_type": "code", "execution_count": null, "id": "9", "metadata": {}, "outputs": [], "source": [ "pbmc68k = pbmc68k[:5, :5].copy()" ] }, { "cell_type": "code", "execution_count": null, "id": "10", "metadata": {}, "outputs": [], "source": [ "pbmc68k" ] }, { "cell_type": "markdown", "id": "11", "metadata": {}, "source": [ "### Upload from memory using explicit semantic `key`" ] }, { "cell_type": "markdown", "id": "12", "metadata": {}, "source": [ "#### Upload h5ad" ] }, { "cell_type": "code", "execution_count": null, "id": "13", "metadata": {}, "outputs": [], "source": [ "pbmc68k_h5ad = ln.Artifact.from_anndata(pbmc68k, key=\"test-upload/pbmc68k.h5ad\")" ] }, { "cell_type": "code", "execution_count": null, "id": "14", "metadata": {}, "outputs": [], "source": [ "pbmc68k_h5ad.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "15", "metadata": {}, "outputs": [], "source": [ "pbmc68k_h5ad.delete(permanent=True)" ] }, { "cell_type": "markdown", "id": "16", "metadata": {}, "source": [ "#### Upload zarr" ] }, { "cell_type": "code", "execution_count": null, "id": "17", "metadata": {}, "outputs": [], "source": [ "# Runs too long, should be tested elsewhere\n", "# pbmc68k_zarr = ln.Artifact(pbmc68k, key=\"test-upload/pbmc68k.zarr\", format=\"zarr\")\n", "# ln.save(pbmc68k_zarr)\n", "# pbmc68k_zarr.delete(permanent=True, storage=True)" ] }, { "cell_type": "markdown", "id": "18", "metadata": {}, "source": [ "### Upload using `id` with implicit `key`" ] }, { "cell_type": "markdown", "id": "19", "metadata": {}, "source": [ "#### Upload h5ad" ] }, { "cell_type": "code", "execution_count": null, "id": "20", "metadata": {}, "outputs": [], "source": [ "pbmc68k_h5ad = ln.Artifact.from_anndata(pbmc68k, description=\"pbmc68k.h5ad\")" ] }, { "cell_type": "code", "execution_count": null, "id": "21", "metadata": {}, "outputs": [], "source": [ "pbmc68k_h5ad.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "22", "metadata": {}, "outputs": [], "source": [ "pbmc68k_h5ad.delete(permanent=True, storage=True)" ] }, { "cell_type": "markdown", "id": "23", "metadata": {}, "source": [ "#### Upload zarr" ] }, { "cell_type": "code", "execution_count": null, "id": "24", "metadata": {}, "outputs": [], "source": [ "# Runs too long, should be tested elsewhere\n", "# pbmc68k_zarr = ln.Artifact(pbmc68k, name=\"pbmc68k.zarr\", format=\"zarr\")\n", "# ln.save(pbmc68k_zarr)\n", "# pbmc68k_zarr.delete(permanent=True, storage=True)" ] }, { "cell_type": "markdown", "id": "25", "metadata": { "tags": [] }, "source": [ "### Error behaviors" ] }, { "cell_type": "markdown", "id": "26", "metadata": {}, "source": [ "Specified file does not exist." ] }, { "cell_type": "code", "execution_count": null, "id": "27", "metadata": {}, "outputs": [], "source": [ "with pytest.raises(FileNotFoundError):\n", " non_existent_h5ad = ln.Artifact(\n", " \"s3://lamindb-ci/test-upload/non_existent_file.h5ad\"\n", " )" ] }, { "cell_type": "markdown", "id": "28", "metadata": {}, "source": [ "Specified buket does not exist. Normally non-existent bucket raises `FileNotFoundError`, but sometimes strarts to raise `PermissionError`." 
] }, { "cell_type": "code", "execution_count": null, "id": "29", "metadata": {}, "outputs": [], "source": [ "with pytest.raises((FileNotFoundError, PermissionError)):\n", " non_existent_h5ad = ln.Artifact(\n", " \"s3://non_existent_bucket_6612366/non_existent_file.h5ad\"\n", " )" ] }, { "cell_type": "markdown", "id": "30", "metadata": {}, "source": [ "## Test existing zarr" ] }, { "cell_type": "markdown", "id": "31", "metadata": {}, "source": [ "See `test_artifact.py` for other artifact types." ] }, { "cell_type": "markdown", "id": "32", "metadata": {}, "source": [ "This should probably go elsewhere:" ] }, { "cell_type": "code", "execution_count": null, "id": "33", "metadata": {}, "outputs": [], "source": [ "# temporarily comment out because of head bucket permission error when\n", "# attempting to get region\n", "# artifact = ln.Artifact(\"s3://lamindb-ci/lndb-storage/pbmc68k.zarr\")\n", "# artifact.save()\n", "# artifact.open()" ] }, { "cell_type": "code", "execution_count": null, "id": "34", "metadata": {}, "outputs": [], "source": [ "ln.setup.delete(\"test-upload\", force=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" }, "nbproject": { "id": "psZgub4FOmzS", "parent": null, "pypackage": null, "time_init": "2023-04-09T20:01:57.780053+00:00", "user_handle": "testuser1", "user_id": "DzTjkKse", "user_name": "Test User1", "version": "0" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: docs/storage/vitessce.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Vitessce integration" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For more comprehensive integration tests, see: https://github.com/laminlabs/lamin-spatial" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!lamin login testuser1\n", "!lamin init --storage \"s3://lamindb-ci/test-vitessce\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import lamindb as ln\n", "import pytest\n", "from vitessce import (\n", " VitessceConfig,\n", " AnnDataWrapper,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Set up test data:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pbmc68k = ln.examples.datasets.anndata_pbmc68k_reduced()[:100, :200].copy()\n", "zarr_filepath = \"my_test.zarr\"\n", "# write the anndata to a local zarr path\n", "pbmc68k.write_zarr(zarr_filepath)\n", "# create an artifact from the path\n", "dataset_artifact = ln.Artifact(zarr_filepath, description=\"Test dataset\").save()\n", "# this is the where the zarr folder is located on a public S3 bucket\n", "dataset_artifact.path.to_url()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a `VitessceConfig` object: " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "hide-output" ] }, "outputs": [], "source": [ "vc = VitessceConfig(schema_version=\"1.0.15\")\n", "vc.add_dataset(name=\"test1\").add_object(\n", " AnnDataWrapper(\n", " adata_artifact=dataset_artifact,\n", " obs_embedding_paths=[\"obsm/X_umap\"],\n", " ),\n", ")\n", 
"vc.to_dict()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vitessce_config_artifact = ln.integrations.save_vitessce_config(\n", " vc, description=\"View testdata in Vitessce\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# different equivalent ways of testing that the action is attached\n", "assert dataset_artifact._actions.get() == vitessce_config_artifact\n", "assert vitessce_config_artifact._action_targets.get() == dataset_artifact\n", "assert vitessce_config_artifact._actions.first() is None\n", "assert vitessce_config_artifact.kind == \"__lamindb_config__\"\n", "assert ln.Artifact.get(_actions=vitessce_config_artifact) == dataset_artifact" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dataset_artifact.delete(permanent=True)\n", "vitessce_config_artifact.delete(permanent=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Test validation within `save_vitessce_config`:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# pass an artifact URL instead of the artifact object itself\n", "vc = VitessceConfig(schema_version=\"1.0.15\")\n", "with pytest.raises(AttributeError) as error:\n", " vc.add_dataset(name=\"test1\").add_object(\n", " AnnDataWrapper(\n", " adata_artifact=dataset_artifact.path.to_url(),\n", " obs_embedding_paths=[\"obsm/X_umap\"],\n", " ),\n", " )\n", "print(error.exconly())\n", "assert error.exconly().startswith(\n", " \"AttributeError: 'str' object has no attribute 'path'\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!rm -rf test-vitessce\n", "!lamin delete --force test-vitessce" ] } ], "metadata": { "kernelspec": { "display_name": "py312", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: docs/storage.md ================================================ # Storage ```{toctree} :maxdepth: 1 storage/upload storage/add-replace-cache storage/anndata-accessor storage/prepare-sync-local-to-cloud storage/sync-local-to-cloud storage/vitessce ``` ================================================ FILE: docs/sync.md ================================================ --- execute_via: python --- # Sync data across databases This guide shows how to sync objects from a source database to your default database. We need a target database: ```python !lamin init --storage ./test-sync --modules bionty ``` Import `lamindb` and optionally run `ln.track()`: ```python import lamindb as ln ln.track() ``` Syncing works for any object type (`Artifact`, `Record`, `Transform`, `ULabel`, etc.). 
Let's sync an artifact to our current default database: ```python db = ln.DB("laminlabs/lamindata") # query the artifact on the source database artifact = db.Artifact.get(key="example_datasets/mini_immuno/dataset1.h5ad") # sync the artifact to the current database artifact.save() ``` If you also want to sync feature & label annotations, pass `transfer="annotations"`: ```python # query again so that `artifact` holds the object on the source database artifact = db.Artifact.get(key="example_datasets/mini_immuno/dataset1.h5ad") # sync the artifact to the current database, including transfer of annotations where necessary artifact.save(transfer="annotations") ``` The artifact now has all feature & label annotations: ```python artifact.describe() ``` The sync is zero-copy, which means that the data itself remained in the original storage location: ```python artifact.path ``` Data lineage indicates the source database of the sync: ```python artifact.view_lineage() ``` The run that initiated the sync is linked via `initiated_by_run`: ```python artifact.run.initiated_by_run.transform ``` Upon calling `.save()` again, `lamindb` identifies that the object already exists in the target database and simply maps it: ```python artifact = db.Artifact.get(key="example_datasets/mini_immuno/dataset1.h5ad") artifact.save() ``` ```{dropdown} How do I know if an object is in the default database or elsewhere? Every `SQLRecord` object has an attribute `._state.db` which can take the following values: - `None`: the object has not yet been saved to any database - `"default"`: the object is saved on the default database instance - `"account/name"`: the object is saved on a non-default database instance referenced by `account/name` (e.g., `laminlabs/lamindata`) ``` ```python tags=["hide-cell"] # test the last 3 cells here assert artifact.transform.description == "Transfer from `laminlabs/lamindata`" assert artifact.transform.key == "__lamindb_transfer__/4XIuR0tvaiXM" assert artifact.transform.uid == "4XIuR0tvaiXM0000" assert artifact.run.initiated_by_run.transform.description.startswith("Sync data") ``` ================================================ FILE: docs/test_notebooks.py ================================================ import sys from pathlib import Path import nbproject_test as test sys.path[:0] = [str(Path(__file__).parent.parent)] from noxfile import GROUPS DOCS = Path(__file__).parents[1] / "docs/" def test_tutorial(): for artifactname in GROUPS["tutorial"]: test.execute_notebooks(DOCS / artifactname, write=True) def test_guide(): for artifactname in GROUPS["guide"]: test.execute_notebooks(DOCS / artifactname, write=True) def test_tiledbsoma(): for artifactname in GROUPS["tiledbsoma"]: test.execute_notebooks(DOCS / artifactname, write=True) def test_biology(): for artifactname in GROUPS["biology"]: test.execute_notebooks(DOCS / artifactname, write=True) ================================================ FILE: docs/track.md ================================================ --- execute_via: python --- # Track notebooks, scripts & workflows This guide walks from tracking data lineage in a notebook to tracking parameters in workflows. ```{raw} html ``` **Note:** To run examples, if you don't have a `lamindb` instance, create one: ```python !lamin init --storage ./test-track ``` ## Manage notebooks and scripts Call {meth}`~lamindb.track` to save your notebook or script as a `transform` and start tracking inputs & outputs of a run. ```{eval-rst} .. 
literalinclude:: scripts/run_track_and_finish.py :language: python ``` You find your notebooks and scripts in the {class}`~lamindb.Transform` registry along with pipelines & functions: ```python transform = ln.Transform.get(key="my_analyses/my_notebook.ipynb") transform.source_code # source code transform.runs.to_dataframe() # all runs in a dataframe transform.latest_run.report # report of latest run transform.latest_run.environment # environment of latest run ``` You can use the CLI to load a transform into your current (development) directory: ```bash lamin load --key my_analyses/my_notebook.ipynb ``` Here is how you'd load the [notebook from the video](https://lamin.ai/laminlabs/lamindata/transform/F4L3oC6QsZvQ) into your local directory: ```bash lamin load https://lamin.ai/laminlabs/lamindata/transform/F4L3oC6QsZvQ ``` (sync-code-with-git)= ### Organize local development If no development directory is set, script & notebook keys equal their filenames. Otherwise, they represent the relative path in the development directory. The exception is packaged source code, whose keys have the form `pypackages/{package_name}/path/to/file.py`. To set the development directory to your shell's current working directory, run: ```bash lamin settings set dev-dir . ``` You can see the current status by running: ```bash lamin info ``` When you `cd` into that directory, you will now auto-connect to the configured lamindb instance. To sync scripts or workflows with their corresponding files in a git repo, either export an environment variable: ```shell export LAMINDB_SYNC_GIT_REPO = <url-of-your-git-repo> ``` Or set the following setting: ```python ln.settings.sync_git_repo = "<url-of-your-git-repo>" ``` If you work on a single project in your lamindb instance, it makes sense to set LaminDB's `dev-dir` to the root of the local git repo clone. ```bash dbs/ project1/ .git/ .lamin/ script1.py notebook1.ipynb ... ``` If you work on multiple projects in your lamindb instance, you can use the `dev-dir` as the local root and nest git repositories in it. ```bash dbs/ database1/ .lamin/ repo1/ .git/ repo2/ .git/ ... ``` ### Use projects You can link the entities created during a run to a project. ```python import lamindb as ln my_project = ln.Project(name="My project").save() # create & save a project ln.track(project="My project") # pass project open("sample.fasta", "w").write(">seq1\nACGT\n") # create a dataset ln.Artifact("sample.fasta", key="sample.fasta").save() # auto-labeled by project ``` Filter entities by project, e.g., artifacts: ```python ln.Artifact.filter(projects=my_project).to_dataframe() ``` Access entities linked to a project: ```python my_project.artifacts.to_dataframe() ``` The same works for `my_project.transforms` or `my_project.runs`. ### Use spaces You can write the entities created during a run into a space that you configure on LaminHub. This is particularly useful if you want to restrict access to a space. Note that this doesn't affect bionty entities, which should typically be commonly accessible.
```python ln.track(space="Our team space") ``` ### Track agent plans Saving an agent plan automatically tags it with `artifact.kind = "plan"` and infers a `key` starting with `.plans/`: ```bash lamin save /path/to/.cursor/plans/my_task.plan.md lamin save /path/to/.claude/plans/my_task.md ``` Link an agent plan against a run: ```python ln.track(plan=".plans/my-agent-plan.md") ``` This links the `plan` artifact to a run in the same way that `transform`, an initiating run (`initiated_by_run`), and `report` / `environment` artifacts are linked to the run. While `transform` acts as the deterministic source code for the run and `initiated_by_run` enables higher-level runs in workflow orchestration, the agent `plan` complements these by linking a plan that steers a non-deterministic agent. (manage-workflows)= ## Manage workflows Here we'll manage workflows with `lamindb`'s {func}`~lamindb.flow` and {func}`~lamindb.step` decorators, which work out-of-the-box with the majority of Python workflow managers:

| tool | workflow decorator | step/task decorator | notes |
| --------- | ------------------ | ------------------- | ---------------------------------------------- |
| `lamindb` | `@flow` | `@step` | inspired by `prefect` |
| `prefect` | `@flow` | `@task` | two decorators |
| `redun` | `@task` (on main) | `@task` | single decorator for everything |
| `dagster` | `@job` or `@asset` | `@op` or `@asset` | asset-centric; `@asset` is primary |
| `flyte` | `@workflow` | `@task` | also `@dynamic` for runtime DAGs |
| `airflow` | `@dag` | `@task` | TaskFlow API (modern); also supports operators |
| `zenml` | `@pipeline` | `@step` | inspired by `prefect` |

If you're looking for more in-depth examples or for integrating with non-decorator-based workflow managers such as Nextflow or Snakemake, see {doc}`docs:pipelines`.

| tool | workflow | step/task | notes |
| ----------- | ------------------ | ----------------- | ---------------- |
| `nextflow` | `workflow` keyword | `process` keyword | groovy-based DSL |
| `snakemake` | `rule` keyword | `rule` keyword | file-based DSL |
| `metaflow` | `FlowSpec` | `@step` | class-based |
| `kedro` | `Pipeline()` | `node()` | function-based |

### A one-step workflow Decorate a function with {func}`~lamindb.flow` to track it as a workflow: ```{eval-rst} .. 
literalinclude:: scripts/my_workflow_with_step.py :language: python :caption: my_workflow_with_step.py ``` Let's run the workflow: ```python !python scripts/my_workflow_with_step.py ``` The lineage of the subsetted artifact resolves the subsetting step: ```python subsetted_artifact = ln.Artifact.get(key="my_analysis/dataset_subsetted.parquet") subsetted_artifact.view_lineage() ``` This is the run that created the `subsetted_artifact`: ```python subsetted_artifact.run ``` This is the initiating run that triggered the function call: ```python subsetted_artifact.run.initiated_by_run ``` These are the parameters of the run: ```python subsetted_artifact.run.params ``` These are the input artifacts: ```python subsetted_artifact.run.input_artifacts.to_dataframe() ``` These are the output artifacts: ```python subsetted_artifact.run.output_artifacts.to_dataframe() ``` ### A workflow with CLI arguments Let's use `click` to parse CLI arguments: ```{eval-rst} .. literalinclude:: scripts/my_workflow_with_click.py :language: python :caption: my_workflow_with_click.py ``` Let's run the workflow: ```python !python scripts/my_workflow_with_click.py --key my_analysis/dataset2.parquet ``` CLI arguments are tracked and accessible via `run.cli_args`: ```python run = ln.Run.filter(transform__key="my_workflow_with_click.py").first() run.describe() ``` Note that it doesn't matter whether you use `click`, `argparse`, or any other CLI argument parser. (track-run-parameters)= ## Track parameters & features We just saw that the function decorators `@ln.flow()` and `@ln.step()` track parameter values automatically. Here is how to pass parameters to `ln.track()`: ```{eval-rst} .. literalinclude:: scripts/run_track_with_params.py :language: python :caption: run_track_with_params.py ``` Run the script. ```python !python scripts/run_track_with_params.py --input-dir ./mydataset --learning-rate 0.01 --downsample ``` Query for all runs that match certain parameters: ```python ln.Run.filter( params__learning_rate=0.01, params__preprocess_params__downsample=True, ).to_dataframe() ``` Describe & get parameters: ```python run = ln.Run.filter(params__learning_rate=0.01).order_by("-started_at").first() run.describe() run.params ``` You can also access the CLI arguments used to start the run directly: ```python run.cli_args ``` You can also track run features in analogy to artifact features. In contrast to params, features are validated against the `Feature` registry and allow you to express relationships with entities in your registries. Let's first define labels & features. ```python experiment_type = ln.Record(name="Experiment", is_type=True).save() experiment_label = ln.Record(name="Experiment1", type=experiment_type).save() ln.Feature(name="s3_folder", dtype=str).save() ln.Feature(name="experiment", dtype=experiment_type).save() ``` ```python !python scripts/run_track_with_features_and_params.py --s3-folder s3://my-bucket/my-folder --experiment Experiment1 ``` ```python ln.Run.filter(s3_folder="s3://my-bucket/my-folder").to_dataframe() ``` Describe & get feature values. ```python run2 = ln.Run.filter( s3_folder="s3://my-bucket/my-folder", experiment="Experiment1" ).last() run2.describe() run2.features.get_values() ``` ## Manage functions in scripts and notebooks If you want more fine-grained data lineage tracking in a script or notebook where you called `ln.track()`, you can also use the `step()` decorator. 
### In a notebook ```python @ln.step() def subset_dataframe( input_artifact_key: str, output_artifact_key: str, subset_rows: int = 2, subset_cols: int = 2, ) -> None: artifact = ln.Artifact.get(key=input_artifact_key) dataset = artifact.load() new_data = dataset.iloc[:subset_rows, :subset_cols] ln.Artifact.from_dataframe(new_data, key=output_artifact_key).save() ``` Prepare a test dataset: ```python df = ln.examples.datasets.mini_immuno.get_dataset1(otype="DataFrame") input_artifact_key = "my_analysis/dataset.parquet" artifact = ln.Artifact.from_dataframe(df, key=input_artifact_key).save() ``` Run the function with default params: ```python output_artifact_key = input_artifact_key.replace(".parquet", "_subsetted.parquet") subset_dataframe(input_artifact_key, output_artifact_key, subset_rows=1) ``` Query for the output: ```python subsetted_artifact = ln.Artifact.get(key=output_artifact_key) subsetted_artifact.view_lineage() ``` Re-run the function with a different parameter: ```python subsetted_artifact = subset_dataframe( input_artifact_key, output_artifact_key, subset_cols=3 ) subsetted_artifact = ln.Artifact.get(key=output_artifact_key) subsetted_artifact.view_lineage() ``` We created a new run: ```python subsetted_artifact.run ``` With new parameters: ```python subsetted_artifact.run.params ``` And a new version of the output artifact: ```python subsetted_artifact.run.output_artifacts.to_dataframe() ``` ### In a script ```{eval-rst} .. literalinclude:: scripts/run_script_with_step.py :language: python :caption: run_script_with_step.py ``` ```python !python scripts/run_script_with_step.py --subset ``` ```python ln.view() ``` ## The database See the state of the database after we ran these different examples: ```python ln.view() ``` ## Using transform versions as templates A transform acts like a template when you use `lamin load` to load it. Say you run: ```bash lamin load https://lamin.ai/account/instance/transform/Akd7gx7Y9oVO0000 ``` Upon running the returned notebook or script, you'll automatically create a new version and be able to browse it via the version dropdown on the UI. Additionally, you can: - label using `ULabel` or `Record`, e.g., `transform.records.add(template_label)` - tag with an indicative `version` string, e.g., `transform.version = "T1"; transform.save()` :::{dropdown} Saving a notebook as an artifact Sometimes you might want to save a notebook as an artifact. This is how you can do it: ```bash lamin save template1.ipynb --key templates/template1.ipynb --description "Template for analysis type 1" --registry artifact ``` ::: A few checks at the end of this notebook: ```python assert run.params == { "input_dir": "./mydataset", "learning_rate": 0.01, "preprocess_params": {"downsample": True, "normalization": "the_good_one"}, }, run.params assert my_project.artifacts.exists() assert my_project.transforms.exists() assert my_project.runs.exists() ``` ================================================ FILE: lamindb/__init__.py ================================================ """A data framework for biology. Installation:: pip install lamindb If you just want to *read* data from a LaminDB instance, use :class:`~lamindb.DB`:: import lamindb as ln db = ln.DB("laminlabs/cellxgene") To *write* data, connect to a writable instance:: lamin login lamin connect account/name You can create an instance at `lamin.ai <https://lamin.ai>`__ and invite collaborators. 
If you prefer to work with a local database (no login required), run:: lamin init --storage ./quickstart-data --modules bionty LaminDB will then auto-connect upon import and you can then create & save objects like this:: import lamindb as ln # → connected lamindb: account/instance ln.Artifact("./my_dataset.parquet", key="datasets/my_dataset.parquet").save() Lineage ======= Track inputs, outputs, parameters, and environments of notebooks, scripts, and functions. .. autosummary:: :toctree: . track finish flow step Artifacts ========= The central `Artifact` registry holds files, folders & arrays across any number of storage locations. .. autosummary:: :toctree: . Artifact All other registries link to `Artifact` to provide context for finding, querying, validating, and managing artifacts. Here is an overview of the core data model: .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/HMfWLa1rFkxcxQEN0000.svg :width: 800px Transforms & runs ================= Data transformations and their executions. .. autosummary:: :toctree: . Transform Run Records, labels, features & schemas =================================== Create labels and manage flexible records, e.g., for samples or donors. .. autosummary:: :toctree: . Record ULabel Define features & schemas to validate artifacts & records. .. autosummary:: :toctree: . Feature Schema Managing operations =================== .. autosummary:: :toctree: . Project Storage User Branch Space Collection Reference Basic utilities =============== Connecting, viewing database content, accessing settings & run context. .. autosummary:: :toctree: . DB connect view save UPath settings context Curators and integrations ========================= .. autosummary:: :toctree: . curators integrations Examples, errors & setup ======================== .. autosummary:: :toctree: . examples errors setup Developer API ============= .. autosummary:: :toctree: . base core models """ # ruff: noqa: I001 # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc. __version__ = "2.4.2" import warnings as _warnings # through SpatialData _warnings.filterwarnings( "ignore", message="The legacy Dask DataFrame implementation is deprecated" ) from lamindb_setup._check_setup import _check_instance_setup from lamindb_setup._connect_instance import connect from lamindb_setup.core.upath import UPath from . import base, errors, setup _check_instance_setup(from_module="lamindb") from .core._functions import flow, step, tracked from ._view import view from .core._context import context from .core._settings import settings from .models import ( Artifact, Collection, Feature, Project, Reference, Run, Schema, Storage, Transform, ULabel, User, Space, Branch, Record, DB, ) from .models.save import save from . import core from . import integrations from . import curators from . 
import examples track = context._track finish = context._finish settings.__doc__ = """Global live settings (:class:`~lamindb.core.Settings`).""" context.__doc__ = """Global run context (:class:`~lamindb.core.Context`).""" from django.db.models import Q Param = Feature # backward compat __all__ = [ # data lineage "track", "finish", "step", "flow", # registries "Artifact", "Storage", "Transform", "Run", "Feature", "ULabel", "Schema", "Record", "User", "Collection", "Project", "Space", "Branch", "Reference", # other "connect", "view", "save", "UPath", "settings", "context", "DB", # curators and integrations "curators", "integrations", # examples, errors, setup "examples", "errors", "setup", # low-level functionality "base", "core", "models", ] ================================================ FILE: lamindb/_finish.py ================================================ from __future__ import annotations import builtins import re from datetime import datetime, timezone from time import sleep from typing import TYPE_CHECKING import lamindb_setup as ln_setup from lamin_utils import logger from lamin_utils._logger import LEVEL_TO_COLORS, LEVEL_TO_ICONS, RESET_COLOR from lamindb_setup.core.hashing import hash_dir, hash_file from lamindb.models import Artifact, Run, Transform is_run_from_ipython = getattr(builtins, "__IPYTHON__", False) if TYPE_CHECKING: from pathlib import Path def get_save_notebook_message() -> str: # do not add bold() or any other complicated characters as then we can't match this # easily anymore in an html to strip it out return f"please hit {get_shortcut()} to save the notebook in your editor" def get_save_notebook_message_retry() -> str: return f"{get_save_notebook_message()} and re-run finish()" # this code was originally in nbproject by the same authors def check_consecutiveness( nb, calling_statement: str = None, silent_success: bool = True ) -> bool: """Check whether code cells have been executed consecutively. Needs to be called in the last code cell of a notebook. Otherwise raises `RuntimeError`. Returns cell transitions that violate execution at increments of 1 as a list of tuples. Args: nb: Notebook content. calling_statement: The statement that calls this function. 
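        silent_success: If `True` (the default), do not log a success message when all code cells were executed consecutively; warnings about violations are logged either way.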
""" cells = nb.cells violations = [] prev = 0 ccount = 0 # need to initialize because notebook might note have code cells # and below, we check if ccount is None for cell in cells: cell_source = "".join(cell["source"]) if cell["cell_type"] != "code" or cell_source == "": continue if calling_statement is not None and calling_statement in cell_source: continue ccount = cell["execution_count"] if ccount is None or prev is None or ccount - prev != 1: violations.append((prev, ccount)) prev = ccount # ignore the very last code cell of the notebook # `check_consecutiveness` is being run during publish if `last_cell`` is True # hence, that cell has ccount is None if ccount is None: violations.pop() any_violations = len(violations) > 0 if any_violations: logger.warning(f"cells {violations} were not run consecutively") elif not silent_success: logger.success("cell execution numbers increase consecutively") return not any_violations def get_shortcut() -> str: import platform return "CMD + s" if platform.system() == "Darwin" else "CTRL + s" def get_seconds_since_modified(filepath) -> float: return datetime.now().timestamp() - filepath.stat().st_mtime def save_run_logs(run: Run, save_run: bool = False) -> None: logs_path = ln_setup.settings.cache_dir / f"run_logs_{run.uid}.txt" if logs_path.exists(): if run.report is not None: logger.important("overwriting run.report") artifact = Artifact( # type: ignore logs_path, description=f"log streams of run {run.uid}", kind="__lamindb_run__", run=False, ) artifact.save(upload=True, print_progress=False) run.report = artifact if save_run: # defaults to false because is slow run.save() # this is from the get_title function in nbproject # should be moved into lamindb sooner or later def prepare_notebook( nb, strip_title: bool = False, ) -> str | None: title_found = False for cell in nb.cells: cell.metadata.clear() # strip cell metadata if not title_found and cell["cell_type"] == "markdown": lines = cell["source"].split("\n") for i, line in enumerate(lines): if line.startswith("# "): line.lstrip("#").strip(" .").strip() title_found = True if strip_title: lines.pop(i) cell["source"] = "\n".join(lines) # strip logging message about saving notebook in editor # this is normally the last cell if cell["cell_type"] == "code" and ".finish(" in cell["source"]: for output in cell["outputs"]: if "to save the notebook in your editor" in output.get("text", ""): cell["outputs"] = [] break return None def notebook_to_report(notebook_path: Path, output_path: Path) -> None: import nbformat import traitlets.config as config from nbconvert import HTMLExporter with open(notebook_path, encoding="utf-8") as f: notebook = nbformat.read(f, as_version=4) prepare_notebook(notebook, strip_title=True) notebook.metadata.clear() # strip notebook metadata # if we were to export as ipynb, the following two lines would do it # with open(output_path, "w", encoding="utf-8") as f: # nbformat.write(notebook, f) # instead we need all this code c = config.Config() c.HTMLExporter.preprocessors = [] c.HTMLExporter.exclude_input_prompt = True c.HTMLExporter.exclude_output_prompt = True c.HTMLExporter.anchor_link_text = " " html_exporter = HTMLExporter(config=c) html, _ = html_exporter.from_notebook_node(notebook) output_path.write_text(html, encoding="utf-8") def notebook_to_script( # type: ignore title: str, notebook_path: Path, script_path: Path | None = None ) -> None | str: import jupytext notebook = jupytext.read(notebook_path) notebook.metadata.clear() py_content = jupytext.writes(notebook, 
fmt="py:percent") # remove global metadata header py_content = re.sub(r"^# ---\n.*?# ---\n\n", "", py_content, flags=re.DOTALL) # replace title py_content = py_content.replace(f"# # {title}", "#") if script_path is None: return py_content else: script_path.write_text(py_content, encoding="utf-8") def clean_r_notebook_html(file_path: Path) -> tuple[str | None, Path]: import re cleaned_content = file_path.read_text() # remove title from content pattern_title = r"(.*?)" title_match = re.search(pattern_title, cleaned_content) title_text = None if title_match: title_text = title_match.group(1) pattern_h1 = f"]*>{re.escape(title_text)}" cleaned_content = re.sub(pattern_title, "", cleaned_content) cleaned_content = re.sub(pattern_h1, "", cleaned_content) # remove error message from content if "to save the notebook in your editor" in cleaned_content: orig_error_message = f"! {get_save_notebook_message_retry()}" # coming up with the regex for this is a bit tricky due to all the # escape characters we'd need to insert into the message; hence, # we do this with a replace() instead cleaned_content = cleaned_content.replace(orig_error_message, "") if "to save the notebook in your editor" in cleaned_content: orig_error_message = orig_error_message.replace( " finish()", "\nfinish()" ) # RStudio might insert a newline cleaned_content = cleaned_content.replace(orig_error_message, "") cleaned_path = file_path.parent / (f"{file_path.stem}.cleaned{file_path.suffix}") cleaned_path.write_text(cleaned_content, encoding="utf-8") return title_text, cleaned_path def check_filepath_recently_saved(filepath: Path, is_finish_retry: bool) -> bool: # the recently_saved_time needs to be very low for the first check # because an accidental save (e.g. via auto-save) might otherwise lead # to upload of an outdated notebook # also see implementation for R notebooks below offset_saved_time = 0.3 if not is_finish_retry else 20 for retry in range(30): recently_saved_time = offset_saved_time + retry # sleep time is 1 sec if get_seconds_since_modified(filepath) > recently_saved_time: if retry == 0: prefix = f"{LEVEL_TO_COLORS[20]}{LEVEL_TO_ICONS[20]}{RESET_COLOR}" print(f"{prefix} {get_save_notebook_message()}", end=" ") elif retry == 9: print(".", end="\n") elif retry == 4: print(". 
still waiting ", end="") else: print(".", end="") sleep(1) else: if retry > 0: prefix = f"{LEVEL_TO_COLORS[25]}{LEVEL_TO_ICONS[25]}{RESET_COLOR}" print(f" {prefix}") # filepath was recently saved, return True return True # if we arrive here, no save event occured, return False return False def save_context_core( *, run: Run | None, transform: Transform, filepath: Path, finished_at: bool = False, skip_save_report: bool = False, ignore_non_consecutive: bool | None = None, from_cli: bool = False, is_retry: bool = False, notebook_runner: str | None = None, message_prefix: str = "go to", ) -> str | None: import lamindb as ln from lamindb.models import ( format_field_value, # needs to come after lamindb was imported because of CLI use ) ln.settings.verbosity = "success" # for scripts, things are easy is_consecutive = True is_ipynb = filepath.suffix == ".ipynb" is_r_notebook = filepath.suffix in {".qmd", ".Rmd"} source_code_path = filepath report_path: Path | None = None save_source_code_and_report = filepath.exists() if ( is_run_from_ipython and notebook_runner != "nbconvert" and filepath.exists() ): # python notebooks in interactive session if is_ipynb: # ignore this for py:percent notebooks import nbproject # it might be that the user modifies the title just before ln.finish() if (nbproject_title := nbproject.meta.live.title) != transform.description: transform.description = nbproject_title transform.save() if not ln_setup._TESTING: save_source_code_and_report = check_filepath_recently_saved( filepath, is_retry ) if not save_source_code_and_report and not is_retry: logger.warning(get_save_notebook_message_retry()) return "retry" elif not save_source_code_and_report: logger.warning( "the notebook on disk wasn't saved within the last 10 sec" ) if is_ipynb and filepath.exists(): # could be from CLI outside interactive session try: import jupytext # noqa: F401 from nbproject.dev import ( read_notebook, ) except ImportError: logger.error("install nbproject & jupytext: pip install nbproject jupytext") return None notebook_content = read_notebook(filepath) # type: ignore if not ignore_non_consecutive: # ignore_non_consecutive is None or False is_consecutive = check_consecutiveness( notebook_content, calling_statement=".finish(" ) if not is_consecutive: response = "n" # ignore_non_consecutive == False if ignore_non_consecutive is None: # only print warning response = "y" # we already printed the warning else: # ask user to confirm response = input( " Do you still want to proceed with finishing? 
(y/n) " ) if response != "y": return "aborted-non-consecutive" # write the report report_path = ln_setup.settings.cache_dir / filepath.name.replace( ".ipynb", ".html" ) notebook_to_report(filepath, report_path) # write the source code source_code_path = ln_setup.settings.cache_dir / filepath.name.replace( ".ipynb", ".py" ) notebook_to_script(transform.description, filepath, source_code_path) elif is_ipynb and not filepath.exists(): logger.warning("notebook file does not exist in compute environment") elif is_r_notebook: if filepath.with_suffix(".nb.html").exists(): report_path = filepath.with_suffix(".nb.html") elif filepath.with_suffix(".html").exists(): report_path = filepath.with_suffix(".html") else: logger.warning( f"no html report found; to attach one, create an .html export for your {filepath.suffix} file and then run: lamin save {filepath}" ) if report_path is not None and is_r_notebook and not from_cli: # R notebooks # see comment above in check_filepath_recently_saved recently_saved_time = 0.3 if not is_retry else 20 if get_seconds_since_modified(report_path) > recently_saved_time: # the automated retry solution of Jupyter notebooks does not work in RStudio because the execution of the notebook cell # seems to block the event loop of the frontend if not is_retry: logger.warning(get_save_notebook_message_retry()) return "retry" else: logger.warning( "the notebook on disk hasn't been saved within the last 20 sec" ) save_source_code_and_report = False ln.settings.creation.artifact_silence_missing_run_warning = True # save source code if save_source_code_and_report: return_code = transform._update_source_code_from_path(source_code_path) if return_code == "rerun-the-notebook": return "rerun-the-notebook" if run is not None: base_path = ln_setup.settings.cache_dir / "environments" / f"run_{run.uid}" paths = [base_path / "run_env_pip.txt", base_path / "r_environment.txt"] existing_paths = [path for path in paths if path.exists()] if len(existing_paths) == 2: # let's not store the python environment for an R session for now existing_paths = [base_path / "r_environment.txt"] if existing_paths: overwrite_env = True if run.environment_id is not None and from_cli: logger.important("run.environment is already saved, ignoring") overwrite_env = False if overwrite_env: # Use directory if multiple files exist, otherwise use the single file artifact_path: Path = ( base_path if len(existing_paths) > 1 else existing_paths[0] ) # Set description based on what we're saving if len(existing_paths) == 1: if existing_paths[0].name == "run_env_pip.txt": description = "requirements.txt" elif existing_paths[0].name == "r_environment.txt": description = "r_environment.txt" size, env_hash, _ = hash_file(artifact_path) else: description = "environments" size, env_hash, _, _ = hash_dir(artifact_path) artifact = ( ln.Artifact.objects.filter(hash=env_hash) .exclude( size=0 ) # exclude empty files, which may occur for one reason or another .one_or_none() ) new_env_artifact = artifact is None if new_env_artifact: if size > 0: artifact = ln.Artifact( artifact_path, description=description, kind="__lamindb_run__", run=False, ) artifact.save(upload=True, print_progress=False) else: logger.warning( "environment file is empty, skipping linking an environment" ) run.environment = artifact if new_env_artifact: logger.debug(f"saved run.environment: {run.environment}") # set finished_at if finished_at and run is not None: if not from_cli: update_finished_at = True else: update_finished_at = run.finished_at is None if 
update_finished_at: run.finished_at = datetime.now(timezone.utc) # track report and set is_consecutive if save_source_code_and_report and not skip_save_report: if run is not None: # do not save a run report if executing through nbconvert if report_path is not None and notebook_runner != "nbconvert": if is_r_notebook: title_text, report_path = clean_r_notebook_html(report_path) if title_text is not None: transform.description = title_text if run.report_id is not None: _, hash, _ = hash_file(report_path) # ignore hash_type for now if hash != run.report.hash: response = input( f"You are about to overwrite an existing report (hash '{run.report.hash}') for Run('{run.uid}'). Proceed? (y/n) " ) if response == "y": run.report.replace(report_path) run.report.save(upload=True, print_progress=False) else: logger.important("keeping old report") else: logger.important("report is already saved") else: report_file = ln.Artifact( # type: ignore report_path, description=f"Report of run {run.uid}", kind="__lamindb_run__", # hidden file run=False, ) report_file.save(upload=True, print_progress=False) run.report = report_file if is_r_notebook: # this is the "cleaned" report report_path.unlink() logger.debug( f"saved transform.latest_run.report: {transform.latest_run.report}" ) run._is_consecutive = is_consecutive if report_path is not None and notebook_runner == "nbconvert": logger.important(f"to save the notebook html, run: lamin save {filepath}") # save both run & transform records if we arrive here if run is not None: run.save() transform_id_prior_to_save = transform.id transform.save() # this in-place updates the state of transform upon hash collision if transform.id != transform_id_prior_to_save: # the hash existed and we're actually back to the previous version # hence, this was in fact a run of the previous transform rather than of # the new transform # this can happen in interactively executed notebooks with a pro-active version bump in case it turns out that the user didn't make a change to the notebook run.transform = transform run.save() ln.Transform.get(transform_id_prior_to_save).delete(permanent=True) # finalize if finished_at and not from_cli and run is not None: run_time = run.finished_at - run.started_at days = run_time.days seconds = run_time.seconds hours = seconds // 3600 minutes = (seconds % 3600) // 60 secs = seconds % 60 formatted_run_time = ( f"{days}d" if days != 0 else "" + f"{hours}h" if hours != 0 else "" + f"{minutes}m" if minutes != 0 else "" + f"{secs}s" ) logger.important( f"finished Run('{run.uid}') after {formatted_run_time} at {format_field_value(run.finished_at)}" ) if ln_setup.settings.instance.is_on_hub: instance_slug = ln_setup.settings.instance.slug if save_source_code_and_report: ui_url = ln_setup.settings.instance.ui_url logger.important( f"{message_prefix}: {ui_url}/{instance_slug}/transform/{transform.uid}" ) if finished_at and not from_cli and save_source_code_and_report: thing = "notebook" if (is_ipynb or is_r_notebook) else "script" logger.important( f"to update your {thing} from the CLI, run: lamin save {filepath}" ) if not save_source_code_and_report: logger.warning( f"did *not* save source code and report -- to do so, run: lamin save {filepath}" ) return None ================================================ FILE: lamindb/_secret_redaction.py ================================================ from __future__ import annotations import re REDACTED_SECRET_VALUE = "***REDACTED***" # noqa: S105 SENSITIVE_PARAM_KEY_PATTERN = re.compile( 
r"(^|[_\-.])(api[_-]?key|access[_-]?key|secret|token|password|passwd|private[_-]?key|client[_-]?secret)($|[_\-.])" ) # Match only quoted literals in assignments, e.g.: # - my_secret = "value" # - my.secret: "value" # - mySecret := "value" # We intentionally do not match unquoted RHS values to avoid false positives like # type annotations (`api_key: str`) or variable forwarding (`api_key=api_key`). _KEY_VALUE_ASSIGNMENT_PATTERN = re.compile( r"(?P(?P[A-Za-z_][A-Za-z0-9_.\-]*)\s*(?P:=|=|:)\s*)" r"(?P(?P['\"`])(?P.*?)(?P=quote))" ) # Match: os.environ["API_KEY"] = "value" _ENV_ASSIGNMENT_PATTERN = re.compile( r"(?Pos\.environ\[\s*(?P['\"])(?P[^'\"]+)(?P=kquote)\s*\]\s*=\s*)" r"(?P(?P['\"`])(?P.*?)(?P=quote))" ) # Match: {"client_secret": "value"} _QUOTED_KEY_ASSIGNMENT_PATTERN = re.compile( r"(?P(?P['\"])(?P[^'\"]+)(?P=kquote)\s*:\s*)" r"(?P(?P['\"`])(?P.*?)(?P=quote))" ) # We intentionally treat env lookups as safe/re-runnable references, not embedded secrets. # Examples that should remain unchanged: # - api_key = os.getenv("OPENAI_API_KEY") # - api_key = getenv("OPENAI_API_KEY") # - api_key = os.environ["OPENAI_API_KEY"] # - api_key = os.environ.get("OPENAI_API_KEY") _ENV_REFERENCE_VALUE_PATTERN = re.compile( r"^(os\.getenv\(.+\)|getenv\(.+\)|os\.environ\[[^\]]+\]|os\.environ\.get\(.+\))$" ) # Match PostgreSQL URLs that include inline credentials: # - postgresql://user:password@host:5432/dbname # - postgres://user:password@host/dbname?sslmode=require _POSTGRES_CREDENTIALS_URL_PATTERN = re.compile( r"^postgres(?:ql)?://[^:@/\s]+:[^@/\s]+@[^/\s]+(?:/[^\s]*)?$", re.IGNORECASE, ) def normalize_sensitive_key_name(key: str) -> str: normalized_key = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", key) normalized_key = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", normalized_key).lower() return normalized_key def is_sensitive_param_key(key: str) -> bool: return bool(SENSITIVE_PARAM_KEY_PATTERN.search(normalize_sensitive_key_name(key))) def is_sensitive_param_value(value: object) -> bool: if not isinstance(value, str): return False return bool(_POSTGRES_CREDENTIALS_URL_PATTERN.match(value.strip())) def _redact_assignment_match(match: re.Match[str]) -> str: key = match.group("key") quoted_value = match.group("quoted") if not is_sensitive_param_key(key) and not is_sensitive_param_value(quoted_value): return match.group(0) # Redact only hardcoded values, not environment-based references. # This preserves reproducibility for source code that reads secrets from env vars. 
raw_value = match.group("value") if _ENV_REFERENCE_VALUE_PATTERN.match(raw_value): return match.group(0) quote = match.group("quote") redacted_value = ( f"{quote}{REDACTED_SECRET_VALUE}{quote}" if quote is not None else REDACTED_SECRET_VALUE ) return f"{match.group('prefix')}{redacted_value}" def redact_secrets_in_source_code(source_code: str) -> tuple[str, int]: redaction_count = 0 def replace_with_count(match: re.Match[str]) -> str: nonlocal redaction_count replaced = _redact_assignment_match(match) if replaced != match.group(0): redaction_count += 1 return replaced redacted = _ENV_ASSIGNMENT_PATTERN.sub(replace_with_count, source_code) redacted = _KEY_VALUE_ASSIGNMENT_PATTERN.sub(replace_with_count, redacted) redacted = _QUOTED_KEY_ASSIGNMENT_PATTERN.sub(replace_with_count, redacted) return redacted, redaction_count ================================================ FILE: lamindb/_view.py ================================================ from __future__ import annotations import builtins import importlib import inspect from typing import TYPE_CHECKING from lamin_utils import colors, logger from lamindb_setup import settings from lamindb_setup._init_instance import get_schema_module_name from lamindb.models import Feature, JsonValue, SQLRecord from .models.feature import serialize_pandas_dtype if TYPE_CHECKING: import pandas as pd is_run_from_ipython = getattr(builtins, "__IPYTHON__", False) def display_df_with_descriptions( df: pd.DataFrame, descriptions: dict[str, str] | None = None ): from IPython.display import HTML, display if descriptions is None: display(df) return None # Start building HTML table html = '' # Create header with title and description rows html += "" # Column names row html += "" html += '' # Index header for col in df.columns: html += f'' html += "" # Descriptions row html += "" html += f'' # Index column for col in df.columns: desc = descriptions.get(col, "") html += f'' html += "" html += "" # Add body rows html += "" for idx, row in df.iterrows(): html += "" html += f'' # Index value for col in df.columns: html += f"" html += "" html += "" html += "
{col}
{df.index.name or ""}{desc}
{idx}{row[col]}
" # Add CSS styles styled_html = f""" {html} """ return display(HTML(styled_html)) def view( *, limit: int = 7, modules: str | None = None, registries: list[str] | None = None, df: pd.DataFrame | None = None, ) -> None: """View metadata. Args: limit: Display the latest `n` records modules: schema module to view. Default's to `None` and displays all registry modules. registries: List of SQLRecord names. Defaults to `None` and lists all registries. df: A DataFrame to display. """ if df is not None: descriptions = { col_name: serialize_pandas_dtype(dtype) for col_name, dtype in df.dtypes.to_dict().items() } feature_dtypes = dict(Feature.objects.values_list("name", "dtype")) descriptions.update(feature_dtypes) display_df_with_descriptions(df, descriptions) return None if is_run_from_ipython: from IPython.display import display as show else: show = logger.print if modules is not None: module_names = [modules] else: module_names = ["core"] + list(settings.instance.modules) for module_name in module_names: schema_module = importlib.import_module(get_schema_module_name(module_name)) # the below is necessary because a schema module might not have been # explicitly accessed importlib.reload(schema_module) all_registries = { registry for registry in schema_module.__dict__.values() if inspect.isclass(registry) and issubclass(registry, SQLRecord) and registry is not SQLRecord } if module_name == "core": all_registries.update({JsonValue}) if registries is not None: filtered_registries = { registry for registry in all_registries if registry.__name__ in registries } else: filtered_registries = all_registries if len(module_names) > 1: section = f"* module: {colors.green(colors.bold(module_name))} *" section_no_color = f"* module: {module_name} *" logger.print("*" * len(section_no_color)) logger.print(section) logger.print("*" * len(section_no_color)) for registry in sorted(filtered_registries, key=lambda x: x.__name__): df = registry.to_dataframe(limit=limit) if df.shape[0] > 0: logger.print(colors.blue(colors.bold(registry.__name__))) show(df) ================================================ FILE: lamindb/base/__init__.py ================================================ """Base library. Is available also when no instance is setup. Modules ------- .. autosummary:: :toctree: . uids types fields dtypes utils """ from . import dtypes, fields, types, uids, utils from .utils import deprecated, doc_args __all__ = ["dtypes", "fields", "types", "uids", "utils"] ================================================ FILE: lamindb/base/dtypes.py ================================================ """Dtype utils. .. autofunction:: check_dtype """ from datetime import datetime from typing import Any, Callable, Iterable import numpy as np def is_list_of_type(value: Any, expected_type: Any) -> bool: """Helper function to check if a value is either of expected_type or a list of that type, or a mix of both in a nested structure.""" if isinstance(value, Iterable) and not isinstance(value, (str, bytes)): # handle nested lists recursively return all(isinstance(item, expected_type) for item in value) return False def check_dtype(expected_type: Any, nullable: bool) -> Callable: """Creates a check function for Pandera that validates a column's dtype. Supports both standard dtype checking and mixed list/single values for the same type. For example, a column with expected_type 'float' would also accept a mix of float values and lists of floats. 
Args: expected_type: String identifier for the expected type ('int', 'float', 'num', 'str') Returns: A function that checks if a series has the expected dtype or contains mixed types """ import pandas as pd from lamindb.models.query_set import SQLRecordList def check_function(series): # empty series are considered valid if feature is nullable # the issue is that nullable in Pandera controls whether None/NaN values are allowed in the column, not whether the column can be empty (0 rows). # so "col": [1, 2, None, 4] is correctly handled by pandera nullable=True, but an empty column "col": [] is not. if nullable and series.isnull().all(): return True # first check if the series is entirely of the expected dtype (fast path) if expected_type == "int" and pd.api.types.is_integer_dtype(series.dtype): return True elif expected_type == "float" and pd.api.types.is_float_dtype(series.dtype): return True elif expected_type == "num" and pd.api.types.is_numeric_dtype(series.dtype): return True elif expected_type == "str" and pd.api.types.is_string_dtype(series.dtype): return True elif expected_type == "path" and pd.api.types.is_string_dtype(series.dtype): return True elif expected_type == "url" and pd.api.types.is_string_dtype(series.dtype): return True elif expected_type == "bool" and pd.api.types.is_bool_dtype(series.dtype): return True # if we're here, it might be a mixed column with object dtype # need to check each value individually if series.dtype == "object" and expected_type.startswith("list"): expected_type_member = expected_type.replace("list[", "").removesuffix("]") if expected_type_member == "int": return series.apply(lambda x: is_list_of_type(x, int)).all() elif expected_type_member == "float": return series.apply(lambda x: is_list_of_type(x, float)).all() elif expected_type_member == "bool": return series.apply(lambda x: is_list_of_type(x, bool)).all() elif expected_type_member == "num": # for numeric, accept either int or float return series.apply(lambda x: is_list_of_type(x, (int, float))).all() elif ( expected_type_member == "str" or expected_type_member == "path" or expected_type_member == "url" or expected_type_member.startswith("cat[") ): return series.apply(lambda x: is_list_of_type(x, str)).all() elif expected_type_member == "list": return series.apply( lambda x: isinstance(x, (list, np.ndarray, SQLRecordList)) ).all() # if we get here, the validation failed return False return check_function def is_valid_datetime_str(date_string: str) -> bool | str: try: dt = datetime.fromisoformat(date_string) return dt.isoformat() except ValueError: return False def is_iterable_of_sqlrecord(value: Any): from lamindb.models import SQLRecord return isinstance(value, Iterable) and isinstance(next(iter(value)), SQLRecord) ================================================ FILE: lamindb/base/fields.py ================================================ """Fields. Django fields with modified default arguments. .. autoclass:: CharField .. autoclass:: TextField .. autoclass:: ForeignKey .. autoclass:: BooleanField .. autoclass:: DateField .. autoclass:: DateTimeField .. autoclass:: BigIntegerField .. autoclass:: IntegerField .. autoclass:: OneToOneField .. autoclass:: FloatField .. autoclass:: DecimalField .. autoclass:: BinaryField .. autoclass:: JSONField .. autoclass:: EmailField .. autoclass:: TimeField .. autoclass:: SlugField .. autoclass:: URLField .. autoclass:: UUIDField .. autoclass:: PositiveIntegerField .. autoclass:: PositiveSmallIntegerField .. autoclass:: SmallIntegerField .. 
autoclass:: GenericIPAddressField .. autoclass:: DurationField """ from django.db import models class CharField(models.CharField): """Custom `CharField` with default values for `blank`, `default`, and `max_length`. Django default values for `CharField` are `blank=False`, `default=""`, undefined `max_length`. """ def __init__(self, max_length: int = 255, **kwargs): kwargs["max_length"] = max_length # Set max_length in kwargs kwargs.setdefault("blank", True) kwargs.setdefault("default", None) super().__init__(**kwargs) # Pass all arguments as kwargs class TextField(models.TextField): """Custom `TextField` with default values for `blank` and `default`. Django default values for `TextField` are `blank=False`, `default=''`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) kwargs.setdefault("default", None) super().__init__(*args, **kwargs) class ForeignKey(models.ForeignKey): """Custom `ForeignKey` with default values for `blank`. Django default value for `ForeignKey` `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) # fix doc string that otherwise errors ForeignKey.get_extra_descriptor_filter.__doc__ = ( ForeignKey.get_extra_descriptor_filter.__doc__.replace( ".filter(**kwargs)", "`.filter(**kwargs)`" ) ) class BooleanField(models.BooleanField): """Custom `BooleanField` with default values for `blank` and `default`. Django default values for `BooleanField` are `blank=False`, `default=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) kwargs.setdefault("default", None) super().__init__(*args, **kwargs) class DateField(models.DateField): """Custom `DateField` with default values for `blank`. Django default values for `DateField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class DateTimeField(models.DateTimeField): """Custom `DateTimeField` with default values for `blank`. Django default values for `DateTimeField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class BigIntegerField(models.BigIntegerField): """Custom `BigIntegerField` with default values for `blank`. Django default values for `BigIntegerField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) kwargs.setdefault("default", None) super().__init__(*args, **kwargs) class IntegerField(models.IntegerField): """Custom `IntegerField` with default values for `blank`. Django default values for `IntegerField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class OneToOneField(models.OneToOneField): """Custom `OneToOneField` with default values for `blank`. Django default values for `OneToOneField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class FloatField(models.FloatField): """Custom `FloatField` with default values for `blank`. Django default values for `FloatField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class DecimalField(models.DecimalField): """Custom `DecimalField` with default values for `blank`. Django default values for `DecimalField` are `blank=False`. 
""" def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class JSONField(models.JSONField): """Custom `JSONField` with default values for `blank`. Django default values for `JSONField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class DurationField(models.DurationField): """Custom `DurationField` with default values for `blank`. Django default values for `DurationField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class URLField(models.URLField): """Custom `URLField` with default values for `blank`. Django default values for `URLField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class EmailField(models.EmailField): """Custom `EmailField` with default values for `blank`. Django default values for `EmailField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class TimeField(models.TimeField): """Custom `TimeField` with default values for `blank`. Django default values for `TimeField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class SlugField(models.SlugField): """Custom `SlugField` with default values for `blank`. Django default values for `SlugField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class UUIDField(models.UUIDField): """Custom `UUIDField` with default values for `blank`. Django default values for `UUIDField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class PositiveIntegerField(models.PositiveIntegerField): """Custom `PositiveIntegerField` with default values for `blank`. Django default values for `PositiveIntegerField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class PositiveSmallIntegerField(models.PositiveSmallIntegerField): """Custom `PositiveSmallIntegerField` with default values for `blank`. Django default values for `PositiveSmallIntegerField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class SmallIntegerField(models.SmallIntegerField): """Custom `SmallIntegerField` with default values for `blank`. Django default values for `SmallIntegerField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class BinaryField(models.BinaryField): """Custom `BinaryField` with default values for `blank`. Django default values for `BinaryField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class GenericIPAddressField(models.GenericIPAddressField): """Custom `GenericIPAddressField` with default values for `blank`. Django default values for `GenericIPAddressField` are `blank=False`. 
""" def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) ================================================ FILE: lamindb/base/ids.py ================================================ from .uids import * # noqa: F403 ================================================ FILE: lamindb/base/types.py ================================================ """Base types. Central object types -------------------- .. autoclass:: ArtifactKind .. autoclass:: TransformKind .. autoclass:: BlockKind .. autoclass:: BranchStatus .. autoclass:: RunStatus .. autoclass:: DtypeStr Basic types ----------- .. autoclass:: AnyPathStr .. autoclass:: StrField .. autoclass:: ListLike .. autoclass:: FieldAttr """ from __future__ import annotations import datetime from typing import TYPE_CHECKING, Literal, Union import numpy as np from django.db.models.query_utils import DeferredAttribute as FieldAttr from lamindb_setup.types import AnyPathStr # noqa: F401 if TYPE_CHECKING: import pandas as pd # need to use Union because __future__.annotations doesn't do the job here <3.10 # typing.TypeAlias, >3.10 on but already deprecated # pd.Series as string to avoid importing pandas at runtime ListLike = Union[list[str], "pd.Series", np.ndarray] StrField = Union[str, FieldAttr] # typing.TypeAlias TransformKind = Literal["pipeline", "notebook", "script", "function"] TransformType = TransformKind # backward compat ArtifactKind = Literal[ "dataset", "model", "plan", "__lamindb_run__", "__lamindb_config__" ] BlockKind = Literal["readme", "comment"] """Block kind, a `README.md`-type page or comment. Any block expects Markdown as the formatting language. """ BranchStatus = Literal["standalone", "draft", "review", "merged", "closed"] """Branch status. ============= ===== ================================================== status code description ============= ===== ================================================== `closed` -2 Change Request was closed without merging. `merged` -1 The branch was merged into another branch. `standalone` 0 A standalone branch without Change Request. `draft` 1 Change Request exists but is not ready for review. `review` 2 Change Request is ready for review. ============= ===== ================================================== The database stores the branch status as an integer code in field `_status_code`. """ RunStatus = Literal[ "scheduled", "restarted", "started", "completed", "errored", "aborted" ] """Run status. =========== ===== =========================== status code description =========== ===== =========================== `scheduled` -3 The run is scheduled. `restarted` -2 The run was restarted. `started` -1 The run has started. `completed` 0 The run completed successfully. `errored` 1 The run ended with an error. `aborted` 2 The run was aborted. =========== ===== =========================== The database stores the run status as an integer code in field `_status_code`. 
""" RUN_STATUS_TO_CODE: dict[RunStatus, int] = { "scheduled": -3, "restarted": -2, "started": -1, "completed": 0, "errored": 1, "aborted": 2, } RUN_CODE_TO_STATUS: dict[int, RunStatus] = { code: status for status, code in RUN_STATUS_TO_CODE.items() } BRANCH_STATUS_TO_CODE: dict[BranchStatus, int] = { "closed": -2, "merged": -1, "standalone": 0, "draft": 1, "review": 2, } BRANCH_CODE_TO_STATUS: dict[int, BranchStatus] = { code: status for status, code in BRANCH_STATUS_TO_CODE.items() } DtypeObject = int | float | str | bool | datetime.date | datetime.datetime | dict DtypeStr = Literal[ "num", # numericals "int", # integer / numpy.integer "float", # float "str", # string "bool", # boolean "datetime", # datetime "date", # date "dict", # dictionary "path", # path, validated as str, but specially treated in the UI "url", # URL, validated as str, but specially treated in the UI "object", # this is a pandas input dtype, we're only using it for complicated types, not for strings; consciously currently not documented ] """String-serialized representations of common data types. ============ ============ ================================================= description lamindb pandas ============ ============ ================================================= numerical `"num"` `int | float` integer `"int"` `int64 | int32 | int16 | int8 | uint | ...` float `"float"` `float64 | float32 | float16 | float8 | ...` string `"str"` `object` boolean `"bool"` `boolean | bool` datetime `"datetime"` `datetime` date `"date"` `object` (pandera requires an ISO-format string, convert with `df["date"] = df["date"].dt.date`) dictionary `"dict"` `object` path `"path"` `str` (pandas does not have a dedicated path type, validated as `str`) url `"url"` `str` (pandas does not have a dedicated url type, validated as `str`) ============ ============ ================================================= .. admonition:: Categorical and relational data types These are **not** contained in the `DTypeStr` `Literal`. For any categorical, you can restrict the permissible values to the values defined in a registry. When serializing this to a string, then `'cat[ULabel]'` or `'cat[bionty.CellType]'` indicate that permissible values are stored in the `name` field of the `ULabel` or `CellType` registry, respectively. You can also restrict to sub-types defined in registries via the `type` field, e.g., `'cat[ULabel[123456ABCDEFG]]'` indicates that values must be of the type with `uid="123456ABCDEFG"` within the `ULabel` registry. In LaminDB, categoricals define relationships with registries. See :class:`~lamindb.Feature` for more details. """ Dtype = DtypeStr # backward compat RegistryId = Literal[ "__lamindb_artifact__", "__lamindb_block__", "__lamindb_collection__", "__lamindb_feature__", "__lamindb_jsonvalue__", "__lamindb_project__", "__lamindb_record__", "__lamindb_run__", "__lamindb_schema__", "__lamindb_storage__", "__lamindb_transform__", "__lamindb_ulabel__", ] ================================================ FILE: lamindb/base/uids.py ================================================ """Universal IDs. Base generators =============== .. autofunction:: base26 .. autofunction:: base62 .. autofunction:: base64 UID generators ================ .. autofunction:: base62_8 .. autofunction:: base62_12 .. autofunction:: base62_16 .. 
autofunction:: base62_20 Collision probabilities ======================= 8 base62 characters (`62**8=2e+14`): ======= =========== n p_collision ======= =========== 100k 2e-05 1M 2e-03 ======= =========== 12 base62 characters (`62**12=3e+21`): ======= =========== n p_collision ======= =========== 100M 2e-06 1B 2e-04 ======= =========== 16 base62 characters (`62**16=5e+28`): ======= =========== n p_collision ======= =========== 1e12 7e-05 1e13 7e-03 ======= =========== 20 base62 characters (`62**20=7e+35`) roughly matches UUID (`2**122=5e+36`): ======= =========== n p_collision ======= =========== 1e16 7e-05 1e17 7e-03 ======= =========== See `source `__. """ import secrets import string def base64(n_char: int) -> str: """Random Base64 string.""" alphabet = string.digits + string.ascii_letters.swapcase() + "_" + "-" uid = "".join(secrets.choice(alphabet) for i in range(n_char)) return uid def base62(n_char: int) -> str: """Random Base62 string.""" alphabet = string.digits + string.ascii_letters.swapcase() uid = "".join(secrets.choice(alphabet) for i in range(n_char)) return uid def base26(n_char: int): """ASCII lowercase.""" alphabet = string.ascii_lowercase uid = "".join(secrets.choice(alphabet) for i in range(n_char)) return uid def base62_4() -> str: return base62(4) def base62_8() -> str: """Random Base62 string of length 8.""" return base62(8) def base62_12() -> str: """Random Base62 string of length 12.""" return base62(12) def base62_16() -> str: """Random Base62 string of length 16.""" return base62(16) def base62_20() -> str: """Random Base62 string of length 20.""" return base62(20) def base62_24() -> str: """Random Base62 string of length 24.""" return base62(24) ================================================ FILE: lamindb/base/users.py ================================================ user_id_cache = {} def _user_has_write_access() -> bool: from django.db import connection with connection.cursor() as cursor: cursor.execute(""" SELECT EXISTS ( SELECT 1 FROM check_access() chk WHERE chk.role in ('write', 'admin') ) """) return cursor.fetchone()[0] def current_user_id() -> int: import lamindb_setup as ln_setup from lamindb_setup import settings from lamindb_setup._init_instance import register_user from lamindb.errors import NoWriteAccess from lamindb.models import User def query_user_id(): if ln_setup.core.django.IS_MIGRATING: return 1 else: user = settings.user user_uid = user.uid try: user_id = User.objects.get(uid=user_uid).id except User.DoesNotExist: register_user(user) try: user_id = User.objects.get(uid=user_uid).id except User.DoesNotExist as e: isettings = settings.instance if isettings.is_read_only_connection: raise NoWriteAccess( "Unable to register a new user in the instance database " "because you have a read-only connection." ) from e if ( isettings._db_permissions == "jwt" and not _user_has_write_access() ): raise NoWriteAccess( "Unable to register a new user in the instance database " "because you don't have write access to any space or registry." ) from e raise e return user_id if settings._instance_exists: slug = settings.instance.slug if slug not in user_id_cache: user_id_cache[slug] = query_user_id() return user_id_cache[slug] else: return query_user_id() ================================================ FILE: lamindb/base/utils.py ================================================ """Utilities. .. autodecorator:: doc_args .. autodecorator:: deprecated .. autodecorator:: class_and_instance_method .. 
autodecorator:: strict_classmethod """ from functools import wraps from types import MethodType from lamindb_setup.core import deprecated, doc_args class class_and_instance_method: """Decorator to define a method that works both as class and instance method.""" def __init__(self, func): self.func = func wraps(func)(self) def __get__(self, instance, owner): if instance is None: # Called on the class return MethodType(self.func, owner) else: # Called on an instance return MethodType(self.func, instance) class strict_classmethod: """Decorator for a classmethod that raises an error when called on an instance.""" def __init__(self, func): self.func = func wraps(func)(self) def __get__(self, instance, owner): if instance is not None: # Called on an instance - raise immediately raise TypeError( f"{owner.__name__}.{self.func.__name__}() is a class method and must be called on the {owner.__name__} class, not on a {owner.__name__} object" ) # Called on the class - return bound method using MethodType return MethodType(self.func, owner) __all__ = [ "doc_args", "deprecated", "class_and_instance_method", "strict_classmethod", ] ================================================ FILE: lamindb/core/__init__.py ================================================ """Core library. Settings & context: .. autosummary:: :toctree: . Settings subsettings Context Artifact loaders: .. autosummary:: :toctree: . loaders Data loaders: .. autosummary:: :toctree: . MappedCollection Modules: .. autosummary:: :toctree: . storage logger """ from lamin_utils import logger from lamin_utils._inspect import InspectResult from .. import errors as exceptions # backward compat from ..base import types # backward compat from ..examples import datasets # backward compat from . import subsettings from ._context import Context from ._settings import Settings def __getattr__(name: str): # need to lazy import a few auxliary modules to maintain backward compatibility # none of them should have been eagerly imported in the first place import importlib if name == "loaders": loaders = importlib.import_module(".loaders", package=__name__) globals()[name] = loaders return loaders if name == "storage": storage = importlib.import_module(".storage", package=__name__) globals()[name] = storage return storage if name == "MappedCollection": from ._mapped_collection import MappedCollection globals()[name] = MappedCollection return MappedCollection raise AttributeError(f"module {__name__!r} has no attribute {name!r}") ================================================ FILE: lamindb/core/_compat.py ================================================ import importlib.util from typing import Any, Callable, TypeVar T = TypeVar("T") def is_package_installed(package_name: str) -> bool: spec = importlib.util.find_spec(package_name) return spec is not None def with_package(package_name: str, operation: Callable[[Any], T]) -> T: """Execute an operation that requires a specific package. Args: package_name: Package name (e.g., "mudata") operation: Function that takes the imported module and returns a result Examples: # For direct package functions result = with_package("mudata", lambda mod: mod.read_zarr(path)) """ try: module = importlib.import_module(package_name) except ImportError: raise ImportError( f"Package '{package_name}' is required but not installed. 
" f"Please install with: pip install {package_name}" ) from None return operation(module) def with_package_obj( obj: Any, class_name: str, package_name: str, operation: Callable[[Any], T] ) -> tuple[bool, T | None]: """Handle operations on objects that require specific packages. Args: obj: The object to operate on class_name: Expected class name (e.g., "MuData") package_name: Package that provides the class (e.g., "mudata") operation: Function to call with the object if package is available. Examples: # For instance methods handled, res = apply_class_func(dmem, "MuData", "mudata", lambda obj: obj.write(filepath)) """ if obj.__class__.__name__ == class_name: try: importlib.import_module(package_name) except ImportError: raise ImportError( f"Object appears to be {class_name} but '{package_name}' package is not installed. " f"Please install with: pip install {package_name}" ) from None result = operation(obj) return True, result return False, None ================================================ FILE: lamindb/core/_context.py ================================================ from __future__ import annotations import builtins import hashlib import os import signal import sys import threading import traceback from datetime import datetime, timezone from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, TextIO import lamindb_setup as ln_setup from django.db.models import Func, IntegerField, Q from lamin_utils._logger import logger from lamindb_setup.core.hashing import hash_file, hash_string from .._secret_redaction import ( REDACTED_SECRET_VALUE, is_sensitive_param_key, is_sensitive_param_value, redact_secrets_in_source_code, ) from ..base.uids import base62_12 from ..errors import InvalidArgument, TrackNotCalled, UpdateContext from ..models import Run, SQLRecord, Transform, format_field_value from ..models._feature_manager import infer_convert_dtype_key_value from ..models._is_versioned import bump_version as bump_version_function from ..models._is_versioned import ( increment_base62, ) from ._settings import settings from ._sync_git import get_transform_reference_from_git_repo from ._track_environment import track_python_environment if TYPE_CHECKING: from types import FrameType, TracebackType from lamindb.base.types import TransformKind from lamindb.models import Artifact, Branch, Project, Space is_run_from_ipython = getattr(builtins, "__IPYTHON__", False) msg_path_failed = "failed to infer notebook path.\nfix: pass `path` to `ln.track()`" def get_key_from_module(caller_module: str) -> str: if "." in caller_module: key_from_module = f"pypackages/{caller_module.replace('.', '/')}.py" else: key_from_module = None return key_from_module def detect_and_process_source_code_file( *, path: str | Path | None, transform_kind: TransformKind | None = None, ) -> tuple[Path, TransformKind, str, str, str | None]: """Track source code file and determine transform metadata. For `.py` files, classified as "script". For `.Rmd` and `.qmd` files, classified as "notebook" because they typically come with an .html run report. Package vs script criterion: source code is part of a **package** if the caller's module name contains at least one `.` (module nesting goes beyond the filename). Otherwise it is a **script** (module nesting stops at the filename, e.g. `__main__`, `__mp_main__`, or a single top-level name). Args: path: Path to the source code file. If None, infers from call stack. Returns: Tuple of (path, transform_kind, reference, reference_type, key_from_module). 
- path: Path object to the source file - transform_kind: "script" or "notebook" - reference: Git reference URL if sync_git_repo is set, else None - reference_type: "url" if reference exists, else None - key_from_module: If caller is part of a package (`.` in __name__), `pypackages/module/path/to/file.py`; else None (key will be computed from dev_dir or path.name). Raises: NotImplementedError: If path cannot be determined from call stack. """ # for `.py` files, classified as "script" # for `.Rmd` and `.qmd` files, which we classify # as "notebook" because they typically come with an .html run report key_from_module: str | None = None if path is None: import inspect frame = inspect.stack()[2] path_str = frame[1] if not path_str or path_str.startswith("<"): raise NotImplementedError( "Cannot determine valid file path, pass manually via path (interactive sessions not yet supported)" ) path = Path(path_str) # package vs script: nesting beyond filename makes the file part of a python package caller_module = frame[0].f_globals.get("__name__", "__main__") key_from_module = get_key_from_module(caller_module) else: path = Path(path) # for Rmd and qmd, we could also extract the title # we don't do this for now as we're setting the title upon `ln.finish()` or `lamin save` # by extracting it from the html while cleaning it: see clean_r_notebook_html() # also see the script_to_notebook() in the CLI _load.py where the title is extracted # from the source code YAML and updated with the transform description # note that ipynb notebooks are handled in a separate function (_track_notebook()) if transform_kind is None: transform_kind = "notebook" if path.suffix in {".Rmd", ".qmd"} else "script" reference = None reference_type = None if settings.sync_git_repo is not None and path.suffix != ".ipynb": reference = get_transform_reference_from_git_repo(path) reference_type = "url" return path, transform_kind, reference, reference_type, key_from_module def get_uid_ext(version: str) -> str: from lamin_utils._base62 import encodebytes # merely zero-padding the nbproject version such that the base62 encoding is # at least 4 characters long doesn't yields sufficiently diverse hashes and # leads to collisions; it'd be nice because the uid_ext would be ordered return encodebytes(hashlib.md5(version.encode()).digest())[:4] # noqa: S324 def get_notebook_path() -> tuple[Path, str]: from nbproject.dev._jupyter_communicate import ( notebook_path as get_notebook_path, ) path = None try: path, env = get_notebook_path(return_env=True) except ValueError as ve: raise ve except Exception as error: raise RuntimeError(msg_path_failed) from error if path is None: raise RuntimeError(msg_path_failed) from None return Path(path), env # from https://stackoverflow.com/questions/61901628 def get_notebook_key_colab() -> str: from socket import gethostbyname, gethostname # type: ignore from requests import get # type: ignore ip = gethostbyname(gethostname()) # 172.28.0.12 try: key = get(f"http://{ip}:9000/api/sessions").json()[0]["name"] # noqa: S113 key = f"colab/{key}" except Exception: logger.warning( "could not get notebook key from Google Colab, using: colab/notebook.ipynb" ) key = "colab/notebook.ipynb" return key def get_cli_call() -> tuple[str, str] | None: """Returns (tool_name, args) when invoked as a script with CLI arguments. Returns None if not run as a script (e.g., in Jupyter, interactive shell) or when no arguments were passed. 
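    For example (illustrative invocation), running `python my_workflow.py --input data.csv`
    would return `("my_workflow.py", "--input data.csv")`.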
""" if len(sys.argv) > 1 and sys.argv[0] and not is_run_from_ipython: return Path(sys.argv[0]).name, " ".join(sys.argv[1:]) return None def pretty_pypackages(dependencies: dict) -> str: deps_list = [] for pkg, ver in dependencies.items(): if ver != "": deps_list.append(pkg + f"=={ver}") else: deps_list.append(pkg) deps_list.sort() return " ".join(deps_list) def last_non_empty_r_block(line: str) -> str: for block in reversed(line.split("\r")): if block: return block return "" class LogStreamHandler: def __init__(self, log_stream: TextIO, file: TextIO, use_buffer: bool): self.log_stream = log_stream self.file = file self._buffer = "" self._use_buffer = use_buffer def write(self, data: str) -> int: data_length = len(data) self.log_stream.write(data) if self.file.closed: return data_length if not self._use_buffer: self.file.write(data) self.file.flush() return data_length self._buffer += data # write only the last part of a line with carriage returns while "\n" in self._buffer: if self.file.closed: return data_length line, self._buffer = self._buffer.split("\n", 1) self.file.write(last_non_empty_r_block(line) + "\n") self.file.flush() return data_length def flush(self): self.log_stream.flush() if not self.file.closed: self.file.flush() # https://laminlabs.slack.com/archives/C07DB677JF6/p1759423901926139 # other tracking frameworks like W&B use our output stream and expect # certain functions like isatty to be available def isatty(self) -> bool: return False # .flush is sometimes (in jupyter etc.) called after every .write # this needs to be called only at the end def flush_buffer(self): if not self.file.closed and self._buffer: self.file.write(last_non_empty_r_block(self._buffer)) self._buffer = "" self.flush() class LogStreamTracker: def __init__(self): self.original_stdout = None self.original_stderr = None self.log_file = None self.is_cleaning_up = False self.original_excepthook: Callable[ [type[BaseException], BaseException, TracebackType | None], Any ] = sys.excepthook self.original_signal_handlers: dict[ signal.Signals, Callable[[int, FrameType | None], Any] | int ] = {} if threading.current_thread() == threading.main_thread(): self.original_signal_handlers[signal.SIGTERM] = signal.getsignal( signal.SIGTERM ) self.original_signal_handlers[signal.SIGINT] = signal.getsignal( signal.SIGINT ) def start(self, run: Run): self.original_stdout = sys.stdout self.original_stderr = sys.stderr self.run = run self.log_file_path = ( ln_setup.settings.cache_dir / f"run_logs_{self.run.uid}.txt" ) self.log_file = open(self.log_file_path, "w", encoding="utf-8") # the instance that's connected is important information self.log_file.write( f"\x1b[92m→\x1b[0m connected lamindb: {ln_setup.settings.instance.slug}\n" ) # use buffering for correct handling of carriage returns sys.stdout = LogStreamHandler( self.original_stdout, self.log_file, use_buffer=True ) # write evrything immediately in stderr sys.stderr = LogStreamHandler( self.original_stderr, self.log_file, use_buffer=False ) # handle signals # signal should be used only in the main thread, otherwise # ValueError: signal only works in main thread of the main interpreter if threading.current_thread() == threading.main_thread(): signal.signal(signal.SIGTERM, self.cleanup) signal.signal(signal.SIGINT, self.cleanup) # handle exceptions sys.excepthook = self.handle_exception # reset handler for lamin logger because sys.stdout has been replaced logger.set_handler() def finish(self): if self.original_stdout: getattr(sys.stdout, "flush_buffer", 
sys.stdout.flush)() sys.stderr.flush() sys.stdout = self.original_stdout sys.stderr = self.original_stderr if not self.log_file.closed: self.log_file.close() # reset handler for lamin logger because sys.stdout has been replaced logger.set_handler() def cleanup(self, signo=None, frame=None): try: from .._finish import save_run_logs if self.original_stdout and not self.is_cleaning_up: self.is_cleaning_up = True if signo is not None: if self.log_file.closed: self.log_file = open(self.log_file_path, "a", encoding="utf-8") getattr(sys.stdout, "flush_buffer", sys.stdout.flush)() sys.stderr.flush() signal_msg = f"\nProcess terminated by signal {signo} ({signal.Signals(signo).name})\n" if frame: signal_msg += ( f"Frame info:\n{''.join(traceback.format_stack(frame))}" ) self.log_file.write(signal_msg) self.log_file.flush() self.run._status_code = 2 # aborted else: self.run._status_code = 1 # errored self.run.finished_at = datetime.now(timezone.utc) sys.stdout = self.original_stdout sys.stderr = self.original_stderr if not self.log_file.closed: self.log_file.close() save_run_logs(self.run, save_run=True) # reset handler for lamin logger because sys.stdout has been replaced logger.set_handler() except: # noqa: E722, S110 pass finally: if signo is not None and signo in self.original_signal_handlers: original_handler = self.original_signal_handlers[signo] if callable(original_handler): original_handler(signo, frame) def handle_exception(self, exc_type, exc_value, exc_traceback): try: if self.original_stdout and not self.is_cleaning_up: if self.log_file.closed: self.log_file = open(self.log_file_path, "a", encoding="utf-8") getattr(sys.stdout, "flush_buffer", sys.stdout.flush)() sys.stderr.flush() error_msg = f"{''.join(traceback.format_exception(exc_type, exc_value, exc_traceback))}" self.log_file.write(error_msg) self.log_file.flush() self.cleanup() except: # noqa: E722, S110 pass finally: self.original_excepthook(exc_type, exc_value, exc_traceback) def serialize_params_to_json(params: dict) -> dict: serialized_params = {} for key, value in params.items(): # None and empty list are missing/empty values, skip them consistent with elsewhere in the code if value is None or (isinstance(value, list) and len(value) == 0): continue dtype, converted_value, _ = infer_convert_dtype_key_value(key, value, mute=True) # converted_value is not JSON if dtype is a SQLRecord or a list of SQLRecords # because we just the above function for features where we'd like to keep SQLRecords as they are # so, need to handle this here if ( dtype == "?" or dtype.startswith("cat") or dtype.startswith("list[cat") ) and dtype not in {"cat ? str", "list[cat ? str]"}: if isinstance(value, SQLRecord): serialized_params[key] = ( f"{value.__class__.__get_name_with_module__()}[{value.uid}]" ) elif dtype.startswith("list[cat"): items = list(value) if items and all(isinstance(item, SQLRecord) for item in items): serialized_params[key] = [ # type: ignore f"{item.__class__.__get_name_with_module__()}[{item.uid}]" for item in items ] else: serialized_params[key] = converted_value if key not in serialized_params: logger.warning( f"skipping param {key} with value {value} and dtype {dtype} not JSON serializable" ) continue if is_sensitive_param_key(key) or is_sensitive_param_value( serialized_params[key] ): serialized_params[key] = REDACTED_SECRET_VALUE return serialized_params class Context: """Run context. Is the book keeper for :func:`~lamindb.track` and :func:`~lamindb.finish`. 
""" def __init__(self, uid: str | None = None, path: Path | None = None): self._uid: str | None = uid self._path: Path | None = path self._description: str | None = None self._version: str | None = None self._transform: Transform | None = None self._run: Run | None = None self._project: Project | None = None self._space: Space | None = None self._branch: Branch | None = None self._logging_message_track: str = "" self._logging_message_imports: str = "" self._stream_tracker: LogStreamTracker = LogStreamTracker() self._is_finish_retry: bool = False self._notebook_runner: str | None = None self._is_step_decorator_run: bool = False @property def transform(self) -> Transform | None: """Managed transform of context.""" return self._transform @property def description(self) -> str | None: """`description` argument for `context.transform`.""" return self._description @description.setter def description(self, value: str | None): self._description = value @property def uid(self) -> str | None: """`uid` argument for `context.transform`.""" return self._uid @uid.setter def uid(self, value: str | None): self._uid = value @property def version(self) -> str | None: """`version` argument for `context.transform`.""" return self._version @version.setter def version(self, value: str | None): self._version = value @property def project(self) -> Project | None: """Project to label entities created during the run.""" return self._project @property def space(self) -> Space | None: """The space in which artifacts, collections, transforms, and runs are saved during the run.""" return self._space @property def branch(self) -> Branch | None: """The branch on which entities are created during the run.""" return self._branch @property def run(self) -> Run | None: """Managed run of context.""" return self._run def _track( self, transform: str | Transform | None = None, *, project: str | Project | None = None, space: str | Space | None = None, branch: str | Branch | None = None, plan: str | Artifact | None = None, features: dict | None = None, params: dict | None = None, new_run: bool | None = None, pypackages: bool | None = None, key: str | None = None, path: str | Path | None = None, source_code: str | None = None, kind: TransformKind | None = None, entrypoint: str | None = None, initiated_by_run: Run | str | None = None, stream_tracking: bool | None = None, ) -> None: """Track a run of a notebook or script. Populates the global run :class:`~lamindb.context` with :class:`~lamindb.Transform` & :class:`~lamindb.Run` objects and tracks the compute environment. Args: transform: A transform (stem) `uid` or object. If `None`, auto-creates a `transform` with its `uid`. project: A project or its `name` or `uid` for labeling entities created during the run. space: A restricted space or its `name` or `uid` in which to store entities created during the run. Default: the `"all"` space. Note that bionty entities ignore this setting and always get written to the `"all"` space. If you want to manually move entities to a different space, set the `.space` field (:doc:`docs:permissions`). branch: A branch (or its `name` or `uid`) on which to store records. plan: A plan, typically an agent plan. Pass an artifact (or its `key` or `uid`). features: A dictionary of features & values to track for the run. params: A dictionary of params & values to track for the run. new_run: If `False`, loads the latest run of transform (default notebook), if `True`, creates new run (default non-notebook). 
pypackages: If `True` or `None`, infers Python packages used in a notebook. key: Transform key. path: Filepath of a notebook or script. source_code: Source code. kind: Transform kind. entrypoint: Optional entrypoint name (e.g. function qualname) for the run. initiated_by_run: Optional parent run (or its `uid`) that triggered this run. If `None`, falls back to the `LAMIN_INITIATED_BY_RUN_UID` environment variable when set. stream_tracking: If set, override whether to capture stdout/stderr to run logs. Used by the flow/step decorator: flows get logs (`True`), steps do not (`False`). Examples: To track the run of a notebook or script: .. literalinclude:: scripts/run_track_and_finish.py :language: python To ensure one version history across file renames:: ln.track("Onv04I53OgtT") To track a project or an agent plan: pass a project/artifact to `ln.track()`, for example:: ln.track(project="My project", plan="./plans/curate-dataset-x.md") Note that you have to create a project or save the agent plan in case it they don't yet exist:: # create a project in Python ln.Project(name="My project").save() # create a project with the CLI lamin create project "My project" # save an agent plan with the CLI lamin save /path/to/.cursor/plans/curate-dataset-x.plan.md lamin save /path/to/.claude/plans/curate-dataset-x.md To sync code with a git repo, see: :ref:`sync-code-with-git`. To track parameters and features, see: :ref:`track-run-parameters`. To browse more examples, see: :doc:`/track`. """ from lamindb.models import Artifact, Branch, Project, Space from .._finish import ( save_context_core, ) # similar logic here: https://github.com/laminlabs/lamindb/pull/2527 if ln_setup.settings.instance.is_read_only_connection: logger.warning("skipping track(), connected in read-only mode") return None if project is None: project = os.environ.get("LAMIN_CURRENT_PROJECT") if project is not None: if isinstance(project, Project): assert project._state.adding is False, ( # noqa: S101 "Project must be saved before passing it to track()" ) project_record = project else: project_record = Project.filter( Q(name=project) | Q(uid=project) ).one_or_none() if project_record is None: raise InvalidArgument( f"Project '{project}' not found, either create it with `ln.Project(name='...').save()` or fix typos." ) self._project = project_record if space is not None: if isinstance(space, Space): assert space._state.adding is False, ( # noqa: S101 "Space must be saved before passing it to track()" ) space_record = space else: space_record = Space.filter(Q(name=space) | Q(uid=space)).one_or_none() if space_record is None: raise InvalidArgument( f"Space '{space}', please check on the hub UI whether you have the correct `uid` or `name`." ) self._space = space_record if branch is not None: if isinstance(branch, Branch): assert branch._state.adding is False, ( # noqa: S101 "Branch must be saved before passing it to track()" ) branch_record = branch else: branch_record = Branch.filter( Q(name=branch) | Q(uid=branch) ).one_or_none() if branch_record is None: raise InvalidArgument( f"Space '{branch}', please check on the hub UI whether you have the correct `uid` or `name`." 
) self._branch = branch_record plan_record: Artifact | None = None if plan is not None: if isinstance(plan, Artifact): assert plan._state.adding is False, ( # noqa: S101 "Plan artifact must be saved before passing it to track()" ) plan_record = plan else: plan_record = Artifact.filter(Q(key=plan) | Q(uid=plan)).one_or_none() if plan_record is None: raise InvalidArgument( f"Plan artifact '{plan}' not found, either create it or use a valid key/uid." ) if initiated_by_run is None: initiated_by_run = os.environ.get("LAMIN_INITIATED_BY_RUN_UID") initiated_by_run_record: Run | None = None if initiated_by_run is not None: if isinstance(initiated_by_run, Run): assert initiated_by_run._state.adding is False, ( # noqa: S101 "initiated_by_run must be saved before passing it to track()" ) initiated_by_run_record = initiated_by_run else: initiated_by_run_record = Run.filter(uid=initiated_by_run).one_or_none() if initiated_by_run_record is None: raise InvalidArgument( f"Run '{initiated_by_run}' not found, please pass a valid run uid." ) self._logging_message_track = "" self._logging_message_imports = "" self._is_step_decorator_run = ( entrypoint is not None and stream_tracking is False ) if transform is not None and isinstance(transform, str): self.uid = transform transform = None uid_was_none = False else: uid_was_none = True self._path = None cli_call = get_cli_call() if transform is None: description = None transform_ref = None transform_ref_type = None if source_code is not None: transform_kind = kind if kind is not None else "function" assert key is not None, ( "`key` cannot be `None` when `source_code` is passed to `track()`." ) assert path is None, ( "`path` cannot be passed when `source_code` is passed to `track()`." ) else: if is_run_from_ipython: self._path, description = self._track_notebook( path_str=path, pypackages=pypackages ) transform_kind = "notebook" else: ( self._path, transform_kind, transform_ref, transform_ref_type, key_from_module, ) = detect_and_process_source_code_file(path=path) if key is None and key_from_module is not None: key = key_from_module if description is None: description = self._description if description is None and cli_call is not None: description = f"CLI: {cli_call[0]}" self._create_or_load_transform( description=description, transform_ref=transform_ref, transform_ref_type=transform_ref_type, transform_kind=transform_kind, key=key, source_code=source_code, ) else: if transform.kind in {"notebook", "script"}: raise ValueError( "Use `ln.track()` without passing transform in a notebook or script" " - metadata is automatically parsed" ) transform_exists = None if transform.id is not None: # transform has an id but unclear whether already saved transform_exists = Transform.filter(id=transform.id).first() if transform_exists is None: transform.save() self._logging_message_track += ( f"created Transform('{transform.uid}', key='{transform.key}')" ) transform_exists = transform else: self._logging_message_track += ( f"loaded Transform('{transform.uid}', key='{transform.key}')" ) self._transform = transform_exists if new_run is None: # for notebooks, default to loading latest runs new_run = ( False if ( self._transform.kind == "notebook" and self._notebook_runner != "nbconvert" ) else True ) # type: ignore run = None if not new_run: # try loading latest run by same user run = ( Run.filter( transform=self._transform, created_by_id=ln_setup.settings.user.id ) .order_by("-created_at") .first() ) if run is not None: # loaded latest run run.started_at = 
datetime.now(timezone.utc) # update run time run._status_code = -2 # re-started if plan_record is not None: run.plan = plan_record run.save() entrypoint_str = ( f", entrypoint='{entrypoint}'" if entrypoint is not None else "" ) self._logging_message_track += f", re-started Run('{run.uid}'{entrypoint_str}) at {format_field_value(run.started_at)}" if run is None: # create new run run = Run(transform=self._transform, plan=plan_record) if entrypoint is not None: run.entrypoint = entrypoint if initiated_by_run_record is not None: run.initiated_by_run = initiated_by_run_record run.started_at = datetime.now(timezone.utc) run._status_code = -1 # started entrypoint_str = ( f", entrypoint='{entrypoint}'" if entrypoint is not None else "" ) self._logging_message_track += f", started new Run('{run.uid}'{entrypoint_str}) at {format_field_value(run.started_at)}" # can only determine at ln.finish() if run was consecutive in # interactive session, otherwise, is consecutive run.is_consecutive = True if is_run_from_ipython else None if params is not None: run.params = serialize_params_to_json(params) self._logging_message_track += "\n→ params: " + ", ".join( f"{key}={value!r}" for key, value in run.params.items() ) if cli_call is not None: _, cli_args = cli_call logger.important(f"script invoked with: {cli_args}") run.cli_args = cli_args run.save() # need to save now if features is not None: run.features.add_values(features) self._logging_message_track += "\n→ features: " + ", ".join( f"{key}={value!r}" for key, value in features.items() ) self._run = run track_python_environment(run) if self.project is not None: # to update a potential project link # is only necessary if transform is loaded rather than newly created # can be optimized by checking whether the transform is loaded, but it typically is self.transform.save() log_to_file = None if log_to_file is None: if stream_tracking is not None: log_to_file = stream_tracking else: # Script runs get stream tracking; decorator-based runs only when # stream_tracking is passed (flow=True from decorator). log_to_file = self.transform.kind == "script" if log_to_file: self._stream_tracker.start(run) logger.important(self._logging_message_track) if self._logging_message_imports: logger.important(self._logging_message_imports) if uid_was_none and self._path is not None: # Flow/step decorators set run.entrypoint. Show this recommendation only # for flows (`stream_tracking=True`) and suppress it for steps. if entrypoint is not None: if stream_tracking: logger.important_hint( f'recommendation: to identify the script across renames, pass the uid: @ln.flow(uid="{self.transform.uid[:-4]}")' ) else: notebook_or_script = ( "notebook" if self._transform.kind == "notebook" else "script" ) r_or_python = "." 
if self._path.suffix in {".py", ".ipynb"} else "$" project_str = ( f', project="{project if isinstance(project, str) else project.name}"' if project is not None else "" ) space_str = ( f', space="{space if isinstance(space, str) else space.name}"' if space is not None else "" ) plan_str = ( f', plan="{plan if isinstance(plan, str) else plan.key}"' if plan is not None else "" ) params_str = ( ", params={...}" if params is not None else "" ) # do not put the values because typically parameterized by user kwargs_str = f"{project_str}{space_str}{plan_str}{params_str}" logger.important_hint( f'recommendation: to identify the {notebook_or_script} across renames, pass the uid: ln{r_or_python}track("{self.transform.uid[:-4]}"{kwargs_str})' ) if ( self.transform.kind == "script" and self._path is not None and not self._is_step_decorator_run ): save_context_core( run=run, transform=self.transform, filepath=self._path, message_prefix="monitor at", ) def _track_notebook( self, *, path_str: str | Path | None, pypackages: bool | None = None, ) -> tuple[Path, str | None]: if path_str is None: path, self._notebook_runner = get_notebook_path() else: path = Path(path_str) if pypackages is None: pypackages = True description = None if path.suffix == ".ipynb" and path.stem.startswith("Untitled"): raise RuntimeError( "Your notebook file name is 'Untitled.ipynb', please rename it before tracking. You might have to re-start your notebook kernel." ) path_str = path.as_posix() if path_str.startswith("/fileId="): logger.warning("tracking on Google Colab is experimental") path_str = get_notebook_key_colab() path = Path(path_str) else: from nbproject.dev import read_notebook from nbproject.dev._meta_live import get_title from nbproject.dev._pypackage import infer_pypackages try: nb = read_notebook(path_str) nbproject_title = get_title(nb) if nbproject_title is not None: description = nbproject_title if pypackages: self._logging_message_imports += ( "notebook imports:" f" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}" ) except Exception: logger.debug("reading the notebook file failed") pass return path, description def _process_aux_transform( self, aux_transform: Transform, transform_hash: str, ) -> tuple[str, Transform | None, str]: # first part of the if condition: no version bump, second part: version bump message = "" if ( # if a user hasn't yet saved the transform source code AND is the same user ( aux_transform.source_code is None and aux_transform.created_by_id == ln_setup.settings.user.id ) # if the transform source code is unchanged # if aux_transform.kind == "notebook", we anticipate the user makes changes to the notebook source code # in an interactive session, hence we *pro-actively bump* the version number by setting `revises` / 'nbconvert' execution is NOT interactive # in the second part of the if condition even though the source code is unchanged at point of running track() or ( aux_transform.hash == transform_hash and ( aux_transform.kind != "notebook" or self._notebook_runner == "nbconvert" ) ) ): uid = aux_transform.uid return uid, aux_transform, message else: uid = f"{aux_transform.uid[:-4]}{increment_base62(aux_transform.uid[-4:])}" message = ( f"found {aux_transform.kind} {aux_transform.key}, making new version" ) if ( aux_transform.hash == transform_hash and aux_transform.kind == "notebook" ): message += " -- anticipating changes" elif aux_transform.hash != transform_hash: message += ( "" # could log "source code changed", but this seems too much ) elif aux_transform.created_by_id 
!= ln_setup.settings.user.id: message += ( f" -- {aux_transform.created_by.handle} already works on this draft" ) return uid, None, message def _create_or_load_transform( self, *, description: str | None = None, transform_ref: str | None = None, transform_ref_type: str | None = None, transform_kind: TransformKind = None, key: str | None = None, source_code: str | None = None, ): source_code_to_store = source_code if source_code is not None: source_code_to_store, redaction_count = redact_secrets_in_source_code( source_code ) if redaction_count > 0: logger.warning( f"redacted {redaction_count} secret-looking assignment(s) before persisting transform source code" ) transform_hash = hash_string(source_code) else: from .._finish import notebook_to_script if not self._path.suffix == ".ipynb": _, transform_hash, _ = hash_file(self._path) else: # need to convert to stripped py:percent format for hashing source_code_path = ( ln_setup.settings.cache_dir / self._path.name.replace(".ipynb", ".py") ) if ( self._path.exists() ): # notebook kernel might be running on a different machine notebook_to_script(description, self._path, source_code_path) _, transform_hash, _ = hash_file(source_code_path) else: logger.debug( "skipping notebook hash comparison, notebook kernel running on a different machine" ) transform_hash = None # see whether we find a transform with the exact same hash if transform_hash is not None: aux_transform = Transform.filter(hash=transform_hash).first() else: aux_transform = None # determine the transform key (only when path-based; key is required when source_code) if key is None: if ln_setup.settings.dev_dir is not None: try: key = self._path.relative_to(ln_setup.settings.dev_dir).as_posix() except ValueError as e: if "subpath" in str(e): logger.warning( f"Path {self._path} is not within the configured dev directory " f"({ln_setup.settings.dev_dir}), falling back to using filename as transform key " f"('{self._path.name}')." 
) key = self._path.name else: raise else: key = self._path.name # if the user did not pass a uid and there is no matching aux_transform # need to search for the transform based on the key if self.uid is None and aux_transform is None: class SlashCount(Func): template = "LENGTH(%(expressions)s) - LENGTH(REPLACE(%(expressions)s, '/', ''))" output_field = IntegerField() # we need to traverse from greater depth to shorter depth so that we match better matches first transforms = ( Transform.filter(key__endswith=key, is_latest=True) .annotate(slash_count=SlashCount("key")) .order_by("-slash_count") ) uid = f"{base62_12()}0000" target_transform = None if len(transforms) != 0: message = "" found_key = False if self._path is not None: for aux_transform in transforms: # check whether the transform key is in the path # that's not going to be the case for keys that have "/" in them and don't match the folder if aux_transform.key in self._path.as_posix(): key = aux_transform.key uid, target_transform, message = ( self._process_aux_transform( aux_transform, transform_hash ) ) found_key = True break if not found_key: plural_s = "s" if len(transforms) > 1 else "" transforms_str = "\n".join( [ f" {transform.uid} → {transform.key}" for transform in transforms ] ) message = f"ignoring transform{plural_s} with same filename in different folder:\n{transforms_str}" if message != "": logger.important(message) self.uid, transform = uid, target_transform # the user did pass the uid elif self.uid is not None and len(self.uid) == 16: transform = Transform.filter(uid=self.uid).one_or_none() else: if self.uid is not None: # the case with length 16 is covered above if not len(self.uid) == 12: raise InvalidArgument( f'Please pass an auto-generated uid instead of "{self.uid}". Resolve by running: ln.track("{base62_12()}")' ) aux_transform = ( Transform.filter(uid__startswith=self.uid) .order_by("-created_at") .first() ) else: # deal with a hash-based match # the user might have a made a copy of the notebook or script # and actually wants to create a new transform if aux_transform is not None and not aux_transform.key.endswith(key): prompt = f"Found transform with same hash but different key: {aux_transform.key}. Did you rename your {transform_kind} to {key} (1) or intentionally made a copy (2)?" 
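# a note on the prompt handling below: response "1" (rename) keeps the matched
# transform -- its key is updated to the new filename further down; response "2"
# (copy) discards the hash match so that a new transform with a fresh uid is
# created; under LAMIN_TESTING=true the prompt is skipped and "1" is assumed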
response = ( "1" if os.getenv("LAMIN_TESTING") == "true" else input(prompt) ) assert response in {"1", "2"}, ( # noqa: S101 f"Please respond with either 1 or 2, not {response}" ) if response == "2": aux_transform, transform_hash = ( None, None, ) # make a new transform if aux_transform is not None: uid, target_transform, message = self._process_aux_transform( aux_transform, transform_hash ) if message != "": logger.important(message) else: uid = f"{self.uid}0000" if self.uid is not None else None target_transform = None self.uid, transform = uid, target_transform if self.version is not None: # test inconsistent version passed if ( transform is not None and transform.version_tag is not None # type: ignore and self.version != transform.version_tag # type: ignore ): raise ValueError( f"Transform is already tagged with version {transform.version_tag}, but you passed {self.version}\n" # noqa: S608 f"If you want to update the transform version, set it outside ln.track(): transform.version_tag = '{self.version}'; transform.save()" ) # test whether version was already used for another member of the family if self.uid is not None and len(self.uid) == 16: suid, vuid = (self.uid[:-4], self.uid[-4:]) transform = Transform.filter( uid__startswith=suid, version_tag=self.version ).one_or_none() if transform is not None and vuid != transform.uid[-4:]: better_version = bump_version_function(self.version) raise SystemExit( f"✗ version '{self.version}' is already taken by Transform('{transform.uid}'); please set another version, e.g., ln.context.version = '{better_version}'" ) # make a new transform record if transform is None: assert key is not None # noqa: S101 transform = Transform( # type: ignore uid=self.uid, version_tag=self.version, description=description, key=key, reference=transform_ref, reference_type=transform_ref_type, kind=transform_kind, source_code=source_code_to_store, skip_hash_lookup=source_code is not None, ) if source_code is not None: transform.hash = transform_hash transform = transform.save() self._logging_message_track += ( f"created Transform('{transform.uid}', key='{transform.key}')" ) else: uid = transform.uid # transform was already saved via `finish()` transform_was_saved = transform.source_code is not None # check whether the transform.key is consistent if transform.key != key: self._logging_message_track += ( f"renaming transform {transform.key} to {key}" ) transform.key = key transform.save() elif transform.description != description and description is not None: transform.description = description transform.save() self._logging_message_track += ( "updated transform description, " # white space on purpose ) elif ( transform.created_by_id != ln_setup.settings.user.id and not transform_was_saved ): raise UpdateContext( f'{transform.created_by.name} ({transform.created_by.handle}) already works on this draft {transform.kind}.\n\nPlease create a revision via `ln.track("{uid[:-4]}{increment_base62(uid[-4:])}")` or a new transform with a *different* key and `ln.track("{base62_12()}0000")`.' 
) if transform.reference != transform_ref: transform.reference = transform_ref transform.reference_type = transform_ref_type transform.save() self._logging_message_track += ( "updated transform reference, " # white space on purpose ) # check whether transform source code was already saved if transform_was_saved: bump_revision = False if ( transform.kind == "notebook" and self._notebook_runner != "nbconvert" ): # we anticipate the user makes changes to the notebook source code # in an interactive session, hence we pro-actively bump the version number bump_revision = True else: if transform_hash != transform.hash: bump_revision = True else: self._logging_message_track += f"loaded Transform('{transform.uid}', key='{transform.key}')" if bump_revision: change_type = ( "re-running notebook with already-saved source code" if ( transform.kind == "notebook" and self._notebook_runner != "nbconvert" ) else "source code changed" ) raise UpdateContext( f'✗ {change_type}, please update the `uid` argument in `track()` to "{uid[:-4]}{increment_base62(uid[-4:])}"' ) else: self._logging_message_track += ( f"loaded Transform('{transform.uid}', key='{transform.key}')" ) self._transform = transform def _finish(self, ignore_non_consecutive: None | bool = None) -> None: """Finish the run of a notebook or script. - writes a timestamp: `run.finished_at` - saves the source code if it is not yet saved: `transform.source_code` - saves a run report: `run.report` When called in a notebook, will prompt to save the notebook in your editor. Args: ignore_non_consecutive: Whether to ignore if a notebook was non-consecutively executed. Examples: See :doc:`/track`. See Also: `lamin save script.py` or `lamin save notebook.ipynb` → `docs `__ """ from .._finish import save_context_core, save_run_logs if self.run is None: raise TrackNotCalled("Please run `ln.track()` before `ln.finish()`") if self._path is None: if self.run.transform.kind in {"script", "notebook"}: raise ValueError( "Transform type is not allowed to be 'script' or 'notebook' because `context._path` is `None`." ) self.run.finished_at = datetime.now(timezone.utc) self.run.save() # reset context so the next _track() starts clean (e.g. 
from decorator) self._uid = None self._run = None self._transform = None self._version = None self._description = None self._is_step_decorator_run = False return None self.run._status_code = 0 if self.transform.kind == "notebook": return_code = save_context_core( run=self.run, transform=self.run.transform, filepath=self._path, finished_at=True, ignore_non_consecutive=ignore_non_consecutive, is_retry=self._is_finish_retry, notebook_runner=self._notebook_runner, ) if return_code == "retry": self._is_finish_retry = True return None else: self.run.finished_at = datetime.now(timezone.utc) self.run.save() # persist finished_at (save_run_logs only saves when log file exists) if ln_setup.settings.instance.is_on_hub and not self._is_step_decorator_run: instance_slug = ln_setup.settings.instance.slug ui_url = ln_setup.settings.instance.ui_url logger.important( f"go to: {ui_url}/{instance_slug}/transform/{self.transform.uid}" ) save_run_logs(self.run, save_run=True) self._stream_tracker.finish() # reset the context attributes so that somebody who runs `track()` after finish # starts fresh self._uid = None self._run = None self._transform = None self._version = None self._description = None self._is_step_decorator_run = False context: Context = Context() ================================================ FILE: lamindb/core/_functions.py ================================================ import functools import inspect from contextvars import ContextVar from datetime import datetime, timezone from pathlib import Path from typing import Callable, Literal, ParamSpec, TypeVar from lamindb.base import deprecated from ..models import Run from ._context import Context, get_key_from_module from ._context import context as global_context P = ParamSpec("P") R = TypeVar("R") # Create a context variable to store the current tracked run current_tracked_run: ContextVar[Run | None] = ContextVar( "current_tracked_run", default=None ) def get_current_tracked_run() -> Run | None: """Get the run object.""" run = current_tracked_run.get() if run is None: run = global_context.run return run def _create_tracked_decorator( uid: str | None = None, is_flow: bool = True, global_run: Literal["memorize", "clear", "none"] = "none", track_arg_aliases: bool = False, ) -> Callable[[Callable[P, R]], Callable[P, R]]: """Internal helper to create tracked decorators. Args: uid: Persist the uid to identify this transform across renames. is_flow: Triggered through @ln.flow(), otherwise @ln.step(). 
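    A sketch of the intended call pattern via the public decorators defined
    below (function names are illustrative)::

        @step()    # is_flow=False; requires an active run context
        def my_step(subset: bool):
            ...

        @flow()    # is_flow=True; sets the global run context by default
        def my_workflow(subset: bool):
            return my_step(subset)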
""" def decorator_tracked(func: Callable[P, R]) -> Callable[P, R]: # Get the original signature sig = inspect.signature(func) @functools.wraps(func) def wrapper_tracked(*args: P.args, **kwargs: P.kwargs) -> R: if global_context.run is None: if not is_flow: raise RuntimeError( "Please track the global run context before using @ln.step(): ln.track() or @ln.flow()" ) else: if is_flow: raise RuntimeError( "Please use @ln.step() or clear the global run context before using @ln.flow(): no `ln.track()` or `@ln.flow(global_run='clear')`" ) bound_args = sig.bind(*args, **kwargs) bound_args.apply_defaults() params = dict(bound_args.arguments) initiated_by_run = get_current_tracked_run() track_kwargs: dict = {} if track_arg_aliases: for key in ("project", "space", "branch", "plan", "initiated_by_run"): if key in params and params[key] is not None: track_kwargs[key] = params[key] if "initiated_by_run" in track_kwargs: initiated_by_run = track_kwargs["initiated_by_run"] path_raw = inspect.getsourcefile(func) path = None # do not pass path when function is defined in an ipython cell if path_raw is not None and Path(path_raw).exists(): path = Path(path_raw) source_code = inspect.getsource(func) if path is None else None transform_kind: Literal["function", "script"] = ( "function" if path is None else "script" ) caller_module = func.__module__ key = get_key_from_module(caller_module) if ( key is None and path is None and caller_module in {"__main__", "__mp_main__"} ): key = f"{initiated_by_run.transform.key}" context = Context(uid=uid, path=path) context._track( uid, path=path, key=key, source_code=source_code, kind=transform_kind, entrypoint=func.__qualname__, params=params, new_run=True, project=track_kwargs.get("project"), space=track_kwargs.get("space"), branch=track_kwargs.get("branch"), plan=track_kwargs.get("plan"), initiated_by_run=initiated_by_run, stream_tracking=is_flow, ) token = current_tracked_run.set(context.run) if global_run in {"memorize", "clear"}: global_context._run = context.run try: result = func(*args, **kwargs) context._finish() return result except Exception as e: run = context.run run.finished_at = datetime.now(timezone.utc) run._status_code = 1 # errored run.save() raise e finally: if ( global_run == "clear" and global_context.run == current_tracked_run.get() ): global_context._run = None current_tracked_run.reset(token) return wrapper_tracked return decorator_tracked def flow( uid: str | None = None, global_run: Literal["memorize", "clear", "none"] = "clear", track_arg_aliases: bool = True, ) -> Callable[[Callable[P, R]], Callable[P, R]]: """Use `@flow()` to track a function as a workflow. You will be able to see inputs, outputs, and parameters of the function in the data lineage graph. The decorator creates a :class:`~lamindb.Transform` with kind `"script"` that maps onto the file in which the function is defined. The function maps onto an entrypoint of the `transform`. A function execution creates a :class:`~lamindb.Run` object that stores the function name in `run.entrypoint`. If the function is defined in a notebook cell or another ephemeral context, the transform is created with kind `"function"`. By default `@ln.flow()`, like `ln.track()`, creates a global run context that can be accessed with `ln.context.run`. Args: uid: Persist the uid to identify a transform across renames. global_run: If `"clear"`, set the global run context `ln.context.run` and clear after the function completes. 
If `"memorize"`, set the global run context and do not clear after the function completes. Set this to `"none"` if you want to track concurrent executions of a `flow()` in the same Python process. track_arg_aliases: If `True` (default), maps function arguments with names `project`, `space`, `branch`, `plan`, and `initiated_by_run` to matching `ln.track()` arguments while also keeping them in `run.params` for reproducibility. Pass `False` to disable this mapping. Examples: To sync a workflow with a file in a git repo, see: :ref:`sync-code-with-git`. For an extensive guide, see: :ref:`manage-workflows`. Here follow some examples. .. literalinclude:: scripts/my_workflow.py :language: python :caption: my_workflow.py .. literalinclude:: scripts/my_workflow_with_step.py :language: python :caption: my_workflow_with_step.py .. literalinclude:: scripts/my_workflow_with_click.py :language: python :caption: my_workflow_with_click.py """ return _create_tracked_decorator( uid=uid, is_flow=True, global_run=global_run, track_arg_aliases=track_arg_aliases, ) def step(uid: str | None = None) -> Callable[[Callable[P, R]], Callable[P, R]]: """Use `@step()` to track a function as a step. Behaves like :func:`~lamindb.flow()`, but acts as a step in a workflow and does not create a global run context. It errors if no initiating run (either global or local run context) exists. See :func:`~lamindb.flow()` for examples. Args: uid: Persist the uid to identify a transform across renames. """ return _create_tracked_decorator(uid=uid, is_flow=False) @deprecated("step") def tracked(uid: str | None = None) -> Callable[[Callable[P, R]], Callable[P, R]]: return step(uid) ================================================ FILE: lamindb/core/_mapped_collection.py ================================================ from __future__ import annotations from collections import Counter from functools import reduce from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd from lamin_utils import logger from lamindb_setup.core.upath import UPath from .storage._anndata_accessor import ( ArrayType, ArrayTypes, GroupType, GroupTypes, StorageType, _safer_read_index, get_spec, registry, ) if TYPE_CHECKING: from lamindb_setup.types import AnyPathStr class _Connect: def __init__(self, storage): if isinstance(storage, UPath): # force no external compression even for files with .gz extension. REMOVE LATER self.conn, self.store = registry.open("h5py", storage, compression=None) self.to_close = True else: self.conn, self.store = None, storage self.to_close = False def __enter__(self): return self.store def __exit__(self, exc_type, exc_val, exc_tb): self.close() def close(self): if not self.to_close: return if hasattr(self.store, "close"): self.store.close() if hasattr(self.conn, "close"): self.conn.close() _decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1) class MappedCollection: """Map-style collection for use in data loaders. This class virtually concatenates `AnnData` arrays as a `pytorch map-style dataset `__. If your `AnnData` collection is in the cloud, move them into a local cache first for faster access. `__getitem__` of the `MappedCollection` object takes a single integer index and returns a dictionary with the observation data sample for this index from the `AnnData` objects in `path_list`. The dictionary has keys for `layers_keys` (`.X` is in `"X"`), `obs_keys`, `obsm_keys` (under `f"obsm_{key}"`) and also `"_store_idx"` for the index of the `AnnData` object containing this observation sample. .. 
note:: For a guide, see :doc:`docs:scrna-mappedcollection`. For more convenient use within :class:`~lamindb.core.MappedCollection`, see :meth:`~lamindb.Collection.mapped`. This currently only works for collections of `AnnData` objects. The implementation was influenced by the `SCimilarity `__ data loader. Args: path_list: A list of paths to `AnnData` objects stored in `.h5ad` or `.zarr` formats. layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list retrieves ``.X``. ``"raw.X"`` retrieves ``.X`` from ``.raw`` slot. Keys not present in an object are omitted from the output for that object. obsm_keys: Keys from the ``.obsm`` slots. Keys not present in an object are omitted from the output for that object. obs_keys: Keys from the ``.obs`` slots. Keys not present in an object are omitted from the output for that object. obs_filter: Select only observations with these values for the given obs columns. Should be a dictionary with obs column names as keys and filtering values (a string or a list of strings) as values. join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed, does not join. The join is applied to ``layers_keys`` except for ``"raw.X"``. encode_labels: Encode labels into integers. Can be a list with elements from ``obs_keys``. unknown_label: Encode this label to -1. Can be a dictionary with keys from ``obs_keys`` if ``encode_labels=True`` or from ``encode_labels`` if it is a list. cache_categories: Enable caching categories of ``obs_keys`` for faster access. parallel: Enable sampling with multiple processes. dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm`` """ def __init__( self, path_list: list[AnyPathStr], layers_keys: str | list[str] | None = None, obs_keys: str | list[str] | None = None, obsm_keys: str | list[str] | None = None, obs_filter: dict[str, str | list[str]] | None = None, join: Literal["inner", "outer"] | None = "inner", encode_labels: bool | list[str] = True, unknown_label: str | dict[str, str] | None = None, cache_categories: bool = True, parallel: bool = False, dtype: str | None = None, ): if join not in {None, "inner", "outer"}: # pragma: nocover raise ValueError( f"join must be one of None, 'inner, or 'outer' but was {type(join)}" ) self.filtered = obs_filter is not None if self.filtered and not isinstance(obs_filter, dict): logger.warning( "Passing a tuple to `obs_filter` is deprecated, use a dictionary" ) obs_filter = {obs_filter[0]: obs_filter[1]} if layers_keys is None: self.layers_keys = ["X"] else: self.layers_keys = ( [layers_keys] if isinstance(layers_keys, str) else layers_keys ) obsm_keys = [obsm_keys] if isinstance(obsm_keys, str) else obsm_keys self.obsm_keys = obsm_keys obs_keys = [obs_keys] if isinstance(obs_keys, str) else obs_keys self.obs_keys = obs_keys if isinstance(encode_labels, list): if len(encode_labels) == 0: encode_labels = False elif obs_keys is None or not all( enc_label in obs_keys for enc_label in encode_labels ): raise ValueError( "All elements of `encode_labels` should be in `obs_keys`." ) else: if encode_labels: encode_labels = obs_keys if obs_keys is not None else False self.encode_labels = encode_labels if encode_labels and isinstance(unknown_label, dict): if not all(unkey in encode_labels for unkey in unknown_label): # type: ignore raise ValueError( "All keys of `unknown_label` should be in `encode_labels` and `obs_keys`." 
) self.unknown_label = unknown_label self.storages = [] # type: ignore self.conns = [] # type: ignore self.parallel = parallel self.path_list = path_list self._make_connections(path_list, parallel) self._cache_has_raw: list[bool] = [] self._cache_obsm_keys: list[set[str]] = [] self._cache_obs_keys: list[set[str]] = [] self._cache_layers_keys: list[set[str]] = [] self._cache_keys() self._cache_cats: dict = {} if self.obs_keys is not None: if cache_categories: self._cache_categories(self.obs_keys) self.encoders: dict = {} if self.encode_labels: self._make_encoders(self.encode_labels) # type: ignore self.n_obs_list = [] self.indices_list = [] for i, storage in enumerate(self.storages): with _Connect(storage) as store: X = store["X"] store_path = self.path_list[i] self._check_csc_raise_error(X, "X", store_path) if isinstance(X, ArrayTypes): # type: ignore n_obs_storage = X.shape[0] else: n_obs_storage = X.attrs["shape"][0] if self.filtered: indices_storage_mask = None for obs_filter_key, obs_filter_values in obs_filter.items(): if isinstance(obs_filter_values, tuple): obs_filter_values = list(obs_filter_values) elif not isinstance(obs_filter_values, list): obs_filter_values = [obs_filter_values] if obs_filter_key in store["obs"]: obs_labels = self._get_labels(store, obs_filter_key) obs_filter_mask = np.isin(obs_labels, obs_filter_values) else: obs_filter_mask = np.full(n_obs_storage, False) if pd.isna(obs_filter_values).any(): obs_filter_mask |= pd.isna(obs_labels) if indices_storage_mask is None: indices_storage_mask = obs_filter_mask else: indices_storage_mask &= obs_filter_mask indices_storage = np.where(indices_storage_mask)[0] n_obs_storage = len(indices_storage) else: indices_storage = np.arange(n_obs_storage) self.n_obs_list.append(n_obs_storage) self.indices_list.append(indices_storage) for layer_key in self.layers_keys: if layer_key == "X": continue lazy_data = self._get_lazy_data(store, layer_key, i) if lazy_data is None: continue self._check_csc_raise_error( lazy_data, "raw.X" if layer_key == "raw.X" else f"layers/{layer_key}", store_path, ) if self.obsm_keys is not None: for obsm_key in self.obsm_keys: if obsm_key in self._cache_obsm_keys[i]: self._check_csc_raise_error( store["obsm"][obsm_key], f"obsm/{obsm_key}", store_path, ) self.n_obs = sum(self.n_obs_list) self.indices = np.hstack(self.indices_list) self.storage_idx = np.repeat(np.arange(len(self.storages)), self.n_obs_list) self.join_vars: Literal["inner", "outer"] | None = join self.var_indices: list | None = None self.var_joint: pd.Index | None = None self.n_vars_list: list | None = None self.var_list: list | None = None self.n_vars: int | None = None if self.join_vars is not None: self._make_join_vars() self.n_vars = len(self.var_joint) self._dtype = dtype self._closed = False def _make_connections(self, path_list: list, parallel: bool): for path in path_list: path = UPath(path) if path.exists() and path.is_file(): # type: ignore if parallel: conn, storage = None, path else: # force no external compression even for files with .gz extension. 
REMOVE LATER conn, storage = registry.open("h5py", path, compression=None) else: conn, storage = registry.open("zarr", path) self.conns.append(conn) self.storages.append(storage) def _cache_keys(self): for storage in self.storages: with _Connect(storage) as store: store_keys = registry.keys(store) self._cache_has_raw.append("raw" in store_keys) for group in ("obsm", "obs", "layers"): cache = getattr(self, f"_cache_{group}_keys") cache.append( set(store_keys[group]) if group in store_keys else set() ) def _cache_categories(self, obs_keys: list): self._cache_cats = {} for label in obs_keys: self._cache_cats[label] = [] for i, storage in enumerate(self.storages): if label not in self._cache_obs_keys[i]: self._cache_cats[label].append(None) continue with _Connect(storage) as store: cats = self._get_categories(store, label) if cats is not None: cats = ( _decode(cats) if isinstance(cats[0], bytes) else cats[...] ) self._cache_cats[label].append(cats) def _make_encoders(self, encode_labels: list): for label in encode_labels: cats = self.get_merged_categories(label) encoder = {} if isinstance(self.unknown_label, dict): unknown_label = self.unknown_label.get(label, None) else: unknown_label = self.unknown_label if unknown_label is not None and unknown_label in cats: cats.remove(unknown_label) encoder[unknown_label] = -1 encoder.update({cat: i for i, cat in enumerate(cats)}) self.encoders[label] = encoder def _read_vars(self): self.var_list = [] self.n_vars_list = [] for storage in self.storages: with _Connect(storage) as store: vars = _safer_read_index(store["var"]) self.var_list.append(vars) self.n_vars_list.append(len(vars)) def _make_join_vars(self): if self.var_list is None: self._read_vars() vars_eq = all(self.var_list[0].equals(vrs) for vrs in self.var_list[1:]) if vars_eq: self.join_vars = None self.var_joint = self.var_list[0] return if self.join_vars == "inner": self.var_joint = reduce(pd.Index.intersection, self.var_list) if len(self.var_joint) == 0: raise ValueError( "The provided AnnData objects don't have shared variables.\n" "Use join='outer'." ) self.var_indices = [ vrs.get_indexer(self.var_joint) for vrs in self.var_list ] elif self.join_vars == "outer": self.var_joint = reduce(pd.Index.union, self.var_list) self.var_indices = [ self.var_joint.get_indexer(vrs) for vrs in self.var_list ] def check_vars_sorted(self, ascending: bool = True) -> bool: """Returns `True` if all variables are sorted in all objects.""" if self.var_list is None: self._read_vars() if ascending: vrs_sort_status = (vrs.is_monotonic_increasing for vrs in self.var_list) else: vrs_sort_status = (vrs.is_monotonic_decreasing for vrs in self.var_list) return all(vrs_sort_status) def check_vars_non_aligned(self, vars: pd.Index | list) -> list[int]: """Returns indices of objects with non-aligned variables. Args: vars: Check alignment against these variables. """ if self.var_list is None: self._read_vars() vars = pd.Index(vars) return [i for i, vrs in enumerate(self.var_list) if not vrs.equals(vars)] def _check_csc_raise_error( self, elem: GroupType | ArrayType, key: str, path: AnyPathStr ): if isinstance(elem, ArrayTypes): # type: ignore return if get_spec(elem).encoding_type == "csc_matrix": if not self.parallel: self.close() raise ValueError( f"{key} in {path} is a csc matrix, `MappedCollection` doesn't support this format yet." 
) def __len__(self): return self.n_obs @property def shape(self) -> tuple[int, int]: """Shape of the (virtually aligned) dataset.""" return (self.n_obs, self.n_vars) @property def original_shapes(self) -> list[tuple[int, int]]: """Shapes of the underlying AnnData objects (with `obs_filter` applied).""" if self.n_vars_list is None: n_vars_list = [None] * len(self.n_obs_list) else: n_vars_list = self.n_vars_list return list(zip(self.n_obs_list, n_vars_list)) def __getitem__(self, idx: int): obs_idx = self.indices[idx] storage_idx = self.storage_idx[idx] if self.var_indices is not None: var_idxs_join = self.var_indices[storage_idx] else: var_idxs_join = None out = {"_store_idx": storage_idx} with _Connect(self.storages[storage_idx]) as store: for layers_key in self.layers_keys: lazy_data = self._get_lazy_data(store, layers_key, storage_idx) if lazy_data is None: continue # do not apply join to raw.X, return as is join_vars = None if layers_key == "raw.X" else self.join_vars out[layers_key] = self._get_data_idx( lazy_data, obs_idx, join_vars, var_idxs_join, self.n_vars ) if self.obsm_keys is not None: for obsm_key in self.obsm_keys: if obsm_key not in self._cache_obsm_keys[storage_idx]: continue lazy_data = store["obsm"][obsm_key] out[f"obsm_{obsm_key}"] = self._get_data_idx(lazy_data, obs_idx) if self.obs_keys is not None: for label in self.obs_keys: if label not in self._cache_obs_keys[storage_idx]: continue if label in self._cache_cats: cats = self._cache_cats[label][storage_idx] if cats is None: cats = [] else: cats = None label_idx = self._get_obs_idx(store, obs_idx, label, cats) if label in self.encoders and label_idx is not np.nan: label_idx = self.encoders[label][label_idx] out[label] = label_idx return out def _get_lazy_data(self, store: StorageType, layers_key: str, storage_idx: int): if layers_key == "X": lazy_data = store["X"] # type: ignore elif layers_key == "raw.X" and self._cache_has_raw[storage_idx]: lazy_data = store["raw"]["X"] # type: ignore elif layers_key in self._cache_layers_keys[storage_idx]: lazy_data = store["layers"][layers_key] # type: ignore else: lazy_data = None return lazy_data def _get_data_idx( self, lazy_data: ArrayType | GroupType, idx: int, join_vars: Literal["inner", "outer"] | None = None, var_idxs_join: list | None = None, n_vars_out: int | None = None, ): """Get the index for the data.""" if isinstance(lazy_data, ArrayTypes): # type: ignore lazy_data_idx = lazy_data[idx] # type: ignore if join_vars is None: result = lazy_data_idx if self._dtype is not None: result = result.astype(self._dtype, copy=False) elif join_vars == "outer": dtype = lazy_data_idx.dtype if self._dtype is None else self._dtype result = np.zeros(n_vars_out, dtype=dtype) result[var_idxs_join] = lazy_data_idx else: # inner join result = lazy_data_idx[var_idxs_join] if self._dtype is not None: result = result.astype(self._dtype, copy=False) return result else: # assume csr_matrix here data = lazy_data["data"] # type: ignore indices = lazy_data["indices"] # type: ignore indptr = lazy_data["indptr"] # type: ignore s = slice(*(indptr[idx : idx + 2])) data_s = data[s] dtype = data_s.dtype if self._dtype is None else self._dtype if join_vars == "outer": lazy_data_idx = np.zeros(n_vars_out, dtype=dtype) lazy_data_idx[var_idxs_join[indices[s]]] = data_s else: lazy_data_idx = np.zeros(lazy_data.attrs["shape"][1], dtype=dtype) # type: ignore lazy_data_idx[indices[s]] = data_s if join_vars == "inner": lazy_data_idx = lazy_data_idx[var_idxs_join] return lazy_data_idx def _get_obs_idx( self, 
storage: StorageType, idx: int, label_key: str, categories: list | None = None, ): """Get the index for the label by key.""" obs = storage["obs"] # type: ignore # how backwards compatible do we want to be here actually? if isinstance(obs, ArrayTypes): # type: ignore label = obs[idx][obs.dtype.names.index(label_key)] else: labels = obs[label_key] if isinstance(labels, ArrayTypes): # type: ignore label = labels[idx] else: label = labels["codes"][idx] if label == -1: return np.nan if categories is not None: cats = categories else: cats = self._get_categories(storage, label_key) if cats is not None and len(cats) > 0: label = cats[label] if isinstance(label, bytes): label = label.decode("utf-8") return label def get_label_weights( self, obs_keys: str | list[str], scaler: float | None = None, return_categories: bool = False, ): """Get all weights for the given label keys. This counts the number of labels for each label and returns weights for each obs label accoding to the formula `1 / num of this label in the data`. If `scaler` is provided, then `scaler / (scaler + num of this label in the data)`. Args: obs_keys: A key in the ``.obs`` slots or a list of keys. If a list is provided, the labels from the obs keys will be concatenated with ``"__"`` delimeter scaler: Use this number to scale the provided weights. return_categories: If `False`, returns weights for each observation, can be directly passed to a sampler. If `True`, returns a dictionary with unique categories for labels (concatenated if `obs_keys` is a list) and their weights. """ if isinstance(obs_keys, str): obs_keys = [obs_keys] labels_list = [] for label_key in obs_keys: labels_to_str = self.get_merged_labels(label_key).astype(str).astype("O") labels_list.append(labels_to_str) if len(labels_list) > 1: labels = ["__".join(labels_obs) for labels_obs in zip(*labels_list)] else: labels = labels_list[0] counter = Counter(labels) if return_categories: return { k: 1.0 / v if scaler is None else scaler / (v + scaler) for k, v in counter.items() } counts = np.array([counter[label] for label in labels]) if scaler is None: weights = 1.0 / counts else: weights = scaler / (counts + scaler) return weights def get_merged_labels(self, label_key: str): """Get merged labels for `label_key` from all `.obs`.""" labels_merge = [] for i, storage in enumerate(self.storages): with _Connect(storage) as store: if label_key not in self._cache_obs_keys[i]: continue labels = self._get_labels(store, label_key, storage_idx=i) if self.filtered: labels = labels[self.indices_list[i]] labels_merge.append(labels) return np.hstack(labels_merge) def get_merged_categories(self, label_key: str): """Get merged categories for `label_key` from all `.obs`.""" cats_merge = set() for i, storage in enumerate(self.storages): with _Connect(storage) as store: if label_key not in self._cache_obs_keys[i]: continue if label_key in self._cache_cats: cats = self._cache_cats[label_key][i] else: cats = self._get_categories(store, label_key) if cats is not None: cats = _decode(cats) if isinstance(cats[0], bytes) else cats cats_merge.update(cats) else: codes = self._get_codes(store, label_key) codes = _decode(codes) if isinstance(codes[0], bytes) else codes cats_merge.update(codes) return sorted(cats_merge) def _get_categories(self, storage: StorageType, label_key: str): """Get categories.""" obs = storage["obs"] # type: ignore if isinstance(obs, ArrayTypes): # type: ignore cat_key_uns = f"{label_key}_categories" if cat_key_uns in storage["uns"]: # type: ignore return 
storage["uns"][cat_key_uns] # type: ignore else: return None else: if "__categories" in obs: cats = obs["__categories"] if label_key in cats: return cats[label_key] else: return None if label_key not in obs: return None labels = obs[label_key] if isinstance(labels, GroupTypes): # type: ignore if "categories" in labels: return labels["categories"] else: return None else: if "categories" in labels.attrs: return labels.attrs["categories"] else: return None return None def _get_codes(self, storage: StorageType, label_key: str): """Get codes.""" obs = storage["obs"] # type: ignore if isinstance(obs, ArrayTypes): # type: ignore label = obs[label_key] else: label = obs[label_key] if isinstance(label, ArrayTypes): # type: ignore return label[...] else: return label["codes"][...] def _get_labels( self, storage: StorageType, label_key: str, storage_idx: int | None = None ): """Get labels.""" codes = self._get_codes(storage, label_key) labels = _decode(codes) if isinstance(codes[0], bytes) else codes if storage_idx is not None and label_key in self._cache_cats: cats = self._cache_cats[label_key][storage_idx] else: cats = self._get_categories(storage, label_key) if cats is not None: cats = _decode(cats) if isinstance(cats[0], bytes) else cats # NaN is coded as -1 nans = labels == -1 labels = cats[labels] # detect and replace nans if nans.any(): labels[nans] = np.nan return labels def close(self): """Close connections to array streaming backend. No effect if `parallel=True`. """ for storage in self.storages: if hasattr(storage, "close"): storage.close() for conn in self.conns: if hasattr(conn, "close"): conn.close() self._closed = True @property def closed(self) -> bool: """Check if connections to array streaming backend are closed. Does not matter if `parallel=True`. """ return self._closed def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() @classmethod def torch_worker_init_fn(cls, worker_id): """`worker_init_fn` for `torch.utils.data.DataLoader`. Improves performance for `num_workers > 1`. 
""" from torch.utils.data import get_worker_info mapped = get_worker_info().dataset mapped.parallel = False mapped.storages = [] mapped.conns = [] mapped._make_connections(mapped.path_list, parallel=False) ================================================ FILE: lamindb/core/_settings.py ================================================ from __future__ import annotations import os import sys from typing import TYPE_CHECKING import lamindb_setup as ln_setup from lamin_utils import colors, logger from lamindb_setup import settings as setup_settings from lamindb_setup._set_managed_storage import set_managed_storage from lamindb_setup.core._settings_instance import sanitize_git_repo_url from lamindb_setup.core._settings_storage import ( StorageSettings, convert_root_path_to_str, ) from .subsettings._annotation_settings import AnnotationSettings, annotation_settings from .subsettings._creation_settings import CreationSettings, creation_settings if TYPE_CHECKING: from collections.abc import Mapping from pathlib import Path from lamindb_setup.types import AnyPathStr from upath import UPath VERBOSITY_TO_INT = { "error": 0, # 40 "warning": 1, # 30 "success": 2, # 25 "info": 3, # 20 "hint": 4, # 15 "debug": 5, # 10 } VERBOSITY_TO_STR: dict[int, str] = dict( [reversed(i) for i in VERBOSITY_TO_INT.items()] # type: ignore ) def raise_if_storage_managed_by_other_instance(storage) -> None: storage_instance_uid = storage.instance_uid if storage_instance_uid != setup_settings.instance.uid: raise ValueError( f"Storage '{storage.root}' exists in another instance ({storage_instance_uid}), cannot write to it from here." ) class Settings: """Settings. Please use the global `ln.settings` object instead of instantiating this class yourself. """ def __init__(self): self._verbosity_int: int = logger._verbosity self._sync_git_repo: str | None = None def __repr__(self) -> str: # pragma: no cover if "sphinx" in sys.modules: return object.__repr__(self) cls_name = colors.green(self.__class__.__name__) verbosity_color = colors.yellow if self.verbosity == "warning" else colors.green verbosity_str = verbosity_color(self.verbosity) storage_root = self._storage_settings.root_as_str storage_str = colors.italic(storage_root) instance_str = colors.italic(self.instance_uid) track_color = colors.green if self.track_run_inputs else colors.yellow track_str = track_color(str(self.track_run_inputs)) lines = [ f"{cls_name}", f" instance: {instance_str}", f" storage: {storage_str}", f" verbosity: {verbosity_str}", f" track_run_inputs: {track_str}", ] if self.sync_git_repo: repo_name = ( self.sync_git_repo.split("/")[-1] if "/" in self.sync_git_repo else self.sync_git_repo ) lines.append(f" sync_git_repo: {colors.italic(repo_name)}") return "\n".join(lines) @property def creation(self) -> CreationSettings: """SQLRecord creation settings. For example, `ln.settings.creation.search_names = False` will disable searching for records with similar names during creation. """ return creation_settings @property def annotation(self) -> AnnotationSettings: """Artifact annotation settings. For example, `ln.settings.creation.search_names = False` will disable searching for records with similar names during creation. """ return annotation_settings # note: this setting should probably be deprecated soon # warnings could then be filtered with a regular warning mechanism track_run_inputs: bool = True """Track run inputs (default `True`). 
If this setting is true, an artifact is recorded as run input upon `.load()`, `.cache()` & `.open()` provided :func:`~lamindb.track` was called in the current compute (Python, R) session. If :func:`~lamindb.track` was not called, you receive a warning message upon `.load()`, `.cache()` & `.open()`. If you switch this setting to `False`, you won't see the warning message anymore and no run inputs will be recorded. FAQ: :doc:`/faq/track-run-inputs` """ __using_key: str | None = None _using_storage: str | None = None @property def _using_key(self) -> str | None: """Key for Django database settings.""" return self.__using_key @_using_key.setter def _using_key(self, value: str | None): ln_setup.settings._using_key = value self.__using_key = value @property def _storage_settings(self) -> ln_setup.core.StorageSettings: if self._using_storage is None: storage_settings = ln_setup.settings.storage else: storage_settings = ln_setup.core.StorageSettings(root=self._using_storage) return storage_settings @property def sync_git_repo(self) -> str | None: """Sync transforms with scripts in git repository. If set, scripts will be synced with the specified git repository. Example:: ln.settings.sync_git_repo = https://github.com/laminlabs/schmidt22 You can also pass the git repo URL via the environment variable `LAMINDB_SYNC_GIT_REPO`:: export LAMINDB_SYNC_GIT_REPO=https://github.com/laminlabs/schmidt22 You'll then see:: ln.settings.sync_git_repo #> 'https://github.com/laminlabs/schmidt22' """ if self._sync_git_repo is not None: return self._sync_git_repo elif os.environ.get("LAMINDB_SYNC_GIT_REPO") is not None: return sanitize_git_repo_url(os.environ["LAMINDB_SYNC_GIT_REPO"]) else: return setup_settings.instance.git_repo @sync_git_repo.setter def sync_git_repo(self, value) -> None: self._sync_git_repo = sanitize_git_repo_url(value) if not self._sync_git_repo.startswith("https://"): # pragma: nocover raise ValueError("git repository URL must start with 'https://'.") @property def storage(self) -> StorageSettings: """Current default storage location for writes. Examples: Retrieve the storage settings:: ln.settings.storage #> StorageSettings(root='s3://my-bucket') Retrieve the storage root:: ln.settings.storage.root #> UPath('s3://my-bucket') Switch the current default storage location:: ln.settings.storage = "s3://some-bucket" Pass additional `fsspec` `kwargs` via:: kwargs = dict( profile="some_profile", # fsspec arg cache_regions=True # fsspec arg for s3 ) ln.settings.storage = "s3://some-bucket", kwargs """ return self._storage_settings @storage.setter def storage(self, path_kwargs: AnyPathStr | tuple[AnyPathStr, Mapping]): from ..models import Storage if isinstance(path_kwargs, tuple): path, kwargs = path_kwargs if isinstance(kwargs, str): kwargs = {"host": kwargs} else: path, kwargs = path_kwargs, {} root_as_str = convert_root_path_to_str(path) exists = Storage.filter(root=root_as_str).one_or_none() if exists is None: response = input( f"Storage location {root_as_str} does not yet exist in the current instance. Do you want to continue with creating it? 
(y/n) " ) # logger.warning(f"deprecated call because storage location does **not yet** exist; please create through ln.Storage(root={path}).save()") if response != "y": return None set_managed_storage(path, **kwargs) else: raise_if_storage_managed_by_other_instance(exists) ssettings = StorageSettings( root=exists.root, region=exists.region, uid=exists.uid, instance_id=ln_setup.settings.instance._id, ) ln_setup.settings.instance._storage = ssettings kwargs.pop("host", None) # host is not needed for existing storage settings.storage._set_fs_kwargs(**kwargs) @property def instance_uid(self) -> str: """The `uid` of the current instance.""" return ln_setup.settings.instance.uid @property def cache_dir(self) -> UPath: """Cache root, a local directory to cache cloud files.""" return ln_setup.settings.cache_dir @property def local_storage(self) -> StorageSettings: """An additional local default storage (a path to its root). Is only available if :attr:`~lamindb.setup.core.InstanceSettings.keep_artifacts_local` is enabled. Guide: :doc:`faq/keep-artifacts-local` """ return ln_setup.settings.instance.local_storage @local_storage.setter def local_storage(self, local_root: Path | str): import lamindb as ln # note duplication with storage setter! ssettings = StorageSettings(root=local_root) exists = ln.Storage.filter(root=ssettings.root_as_str).one_or_none() if exists is None: response = input( f"Storage location {ssettings.root_as_str} does not yet exist. Do you want to continue with creating it? (y/n) " ) # logger.warning(f"deprecated call because storage location does **not yet** exist; going forward, please create through ln.Storage(root={path}).save() going forward") if response != "y": return None else: raise_if_storage_managed_by_other_instance(exists) ln_setup.settings.instance.local_storage = local_root @property def verbosity(self) -> str: """Logger verbosity (default `'warning'`). - `'error'`: only show error messages - `'warning'`: also show warning messages - `'success'`: also show success and save messages - `'info'`: also show info messages - `'hint'`: also show hint messages - `'debug'`: also show detailed debug messages """ return VERBOSITY_TO_STR[self._verbosity_int] @verbosity.setter def verbosity(self, verbosity: str | int): if isinstance(verbosity, str): verbosity_int = VERBOSITY_TO_INT[verbosity] else: verbosity_int = verbosity self._verbosity_int = verbosity_int logger.set_verbosity(verbosity_int) settings = Settings() ================================================ FILE: lamindb/core/_sync_git.py ================================================ from __future__ import annotations import subprocess from pathlib import Path from lamin_utils import logger from lamindb_setup import settings as setup_settings from lamindb_setup.core.hashing import hash_code from ..core._settings import sanitize_git_repo_url, settings from ..errors import BlobHashNotFound def get_git_repo_from_remote(url: str | None = None, depth: int | None = 10) -> Path: """Clone the git repository if not already cloned. If `depth` is provided, a shallow clone is performed and no tags are fetched. 
""" repo_url = url or settings.sync_git_repo repo_dir = setup_settings.cache_dir / repo_url.split("/")[-1] if repo_dir.exists(): logger.debug(f"git repo {repo_dir} already exists locally") return repo_dir logger.important( f"running outside of synched git repo, cloning {repo_url} into {repo_dir}" ) args = ["git", "clone", f"{repo_url}.git"] if depth is not None: # if depth is provided, will not fetch tags args += ["--depth", f"{depth}"] result = subprocess.run( args, capture_output=True, cwd=setup_settings.cache_dir, ) if result.returncode != 0 or not repo_dir.exists(): raise RuntimeError(result.stderr.decode()) return repo_dir def check_local_git_repo() -> bool: result = subprocess.run( ["git", "config", "--get", "remote.origin.url"], capture_output=True, ) result_str = result.stdout.decode().strip() if result_str == "": # running-not-in-a-git-repo return False else: remote_url = sanitize_git_repo_url(result_str) if remote_url == settings.sync_git_repo: # running-in-correct-git-repo return True else: logger.warning( f"running in git repo: {remote_url}, expected: {settings.sync_git_repo}" ) return False def get_git_commit_hash(blob_hash: str, repo_dir: Path | None = None) -> str | None: # Fetch all remote branches so that we can also search them fetch_command = ["git", "fetch", "origin", "+refs/heads/*:refs/remotes/origin/*"] subprocess.run(fetch_command, cwd=repo_dir, check=True) # Find the commit containing the blob hash in all branches command = [ "git", "log", "--all", f"--find-object={blob_hash}", "--pretty=format:%H", ] result = subprocess.run( command, capture_output=True, cwd=repo_dir, ) # We just care to find one commit # Hence, we split by new line ("\n") and use the first one commit_hash = result.stdout.decode().split("\n")[0] if not commit_hash or result.returncode == 1: return None default_branch = ( subprocess.run( ["git", "rev-parse", "--abbrev-ref", "origin/HEAD"], capture_output=True, cwd=repo_dir, text=True, ) .stdout.strip() .split("/")[-1] ) # Find all branches containing the commit commit_containing_branches = subprocess.run( ["git", "branch", "--all", "--contains", commit_hash], capture_output=True, cwd=repo_dir, text=True, ).stdout.split("\n") # Clean up branch names and filter out the default branch commit_containing_branches = [ branch.strip().replace("remotes/", "") for branch in commit_containing_branches if branch.strip() ] non_default_branches = [ branch for branch in commit_containing_branches if default_branch not in branch ] if non_default_branches: logger.warning( f"code blob hash {blob_hash} was found in non-default branch(es): {', '.join(non_default_branches)}" ) assert ( # noqa: S101 len(commit_hash) == 40 ), f"commit hash |{commit_hash}| is not 40 characters long" return commit_hash def get_filepath_within_git_repo( commit_hash: str, blob_hash: str, repo_dir: Path | None ) -> str: # repo_dir might not point to the root of the # the git repository because git log --find-object works # from anywhere in the repo, hence, let's get the root repo_root = ( subprocess.run( ["git", "rev-parse", "--show-toplevel"], capture_output=True, cwd=repo_dir, ) .stdout.decode() .strip() ) # Run the git commands separately to circumvent spawning a shell git_command = ["git", "ls-tree", "-r", commit_hash] git_process = subprocess.Popen( git_command, stdout=subprocess.PIPE, cwd=repo_root, ) grep_command = ["grep", "-E", blob_hash] result = subprocess.run( grep_command, stdin=git_process.stdout, capture_output=True, cwd=repo_root, ) # Close the stdout to allow git_process to 
receive a SIGPIPE if grep_command exits git_process.stdout.close() git_process.wait() command = " ".join(git_command) + " | " + " ".join(grep_command) if result.returncode != 0 and result.stderr.decode() != "": raise RuntimeError(f"{command}\n{result.stderr.decode()}") if len(result.stdout.decode()) == 0: raise RuntimeError( f"Could not find path in git repo {settings.sync_git_repo} running:\n{command}" f"\nin local clone: {repo_root}" ) filepath = result.stdout.decode().split()[-1] return filepath def get_transform_reference_from_git_repo(path: Path) -> str: blob_hash = hash_code(path).hexdigest() commit_hash = None if check_local_git_repo(): repo_dir = None else: repo_dir = get_git_repo_from_remote() commit_hash = get_git_commit_hash(blob_hash, repo_dir=repo_dir) if commit_hash is None: if repo_dir is None: repo_dir = Path.cwd() raise BlobHashNotFound( f"❌ Did not find blob hash {blob_hash} in git repo: {settings.sync_git_repo}\n" f"Did you commit & push the script to the remote repo? -> {path}" ) gitpath = get_filepath_within_git_repo(commit_hash, blob_hash, repo_dir) reference = f"{settings.sync_git_repo}/blob/{commit_hash}/{gitpath}" return reference def get_and_validate_git_metadata( url: str, path: str, version: str | None = None, branch: str | None = None, ) -> tuple[str, str]: """Get metadata from a git repository. Args: url: Git repository URL (e.g., "https://github.com/user/repo") path: Path to the main script within the repository version: Optional version/tag to checkout branch: Optional branch name (defaults to repository's default branch) Returns: Dictionary containing: - commit_hash: The current commit hash - url: The repository URL - main_script: Path to the main script - revision: The version/tag (if provided) - branch: The branch name Raises: RuntimeError: If git operations fail FileNotFoundError: If the specified path does not exist in the repository """ url = sanitize_git_repo_url(url) repo_dir = get_git_repo_from_remote(url, depth=None) # Determine the branch to use if branch is None: # Get the default branch if not specified result_str = subprocess.run( ["git", "rev-parse", "--abbrev-ref", "origin/HEAD"], capture_output=True, cwd=repo_dir, text=True, ) if result_str.returncode == 0: branch = result_str.stdout.strip().split("/")[-1] else: branch = "main" # fallback to main # Fetch the latest changes subprocess.run( ["git", "fetch", "origin"], capture_output=True, cwd=repo_dir, check=True, ) # Checkout the specified version or branch if version is not None: # Version takes precedence - checkout the tag/version result = subprocess.run( ["git", "checkout", version], capture_output=True, cwd=repo_dir, ) if result.returncode != 0: raise ValueError( f"Failed to checkout version {version}: {result.stderr.decode()}" ) logger.info(f"checked out version {version}") else: # Checkout the branch result = subprocess.run( ["git", "checkout", f"origin/{branch}"], capture_output=True, cwd=repo_dir, ) if result.returncode != 0: raise ValueError( f"Failed to checkout branch {branch}: {result.stderr.decode()}" ) logger.info(f"checked out branch {branch}") # Get the current commit hash result_str = subprocess.run( ["git", "rev-parse", "HEAD"], capture_output=True, cwd=repo_dir, text=True, ) if result_str.returncode != 0: raise RuntimeError(f"Failed to get commit hash: {result_str.stderr}") commit_hash = result_str.stdout.strip() assert ( # noqa: S101 len(commit_hash) == 40 ), f"commit hash |{commit_hash}| is not 40 characters long" # Verify that the path exists as a file in the repository 
file_path = repo_dir / path if not file_path.exists(): raise FileNotFoundError(f"Path '{path}' does not exist in repository {url}") if not file_path.is_file(): raise FileNotFoundError( f"Path '{path}' exists but is not a file in repository {url}" ) return url, commit_hash ================================================ FILE: lamindb/core/_track_environment.py ================================================ from __future__ import annotations import subprocess import sys from typing import TYPE_CHECKING import lamindb_setup as ln_setup from lamin_utils import logger if TYPE_CHECKING: from lamindb.models import Run def track_python_environment(run: Run) -> None: env_dir = ln_setup.settings.cache_dir / "environments" / f"run_{run.uid}" filepath = env_dir / "run_env_pip.txt" if not env_dir.exists(): filepath.parent.mkdir(parents=True) # create a requirements.txt # we don't create a conda environment.yml mostly for its slowness try: with open(filepath, "w") as f: result = subprocess.run( [sys.executable, "-m", "pip", "freeze"], stdout=f, ) except OSError as e: result = None logger.warning(f"could not run pip freeze with error {e}") if result is not None and result.returncode == 0: logger.info(f"tracked pip freeze > {str(filepath)}") ================================================ FILE: lamindb/core/exceptions.py ================================================ from ..errors import * # noqa: F403 backward compat ================================================ FILE: lamindb/core/loaders.py ================================================ """Loaders in :class:`lamindb.Artifact.load`. .. autodata:: SUPPORTED_SUFFIXES .. autofunction:: load_fcs .. autofunction:: load_tsv .. autofunction:: load_h5ad .. autofunction:: load_h5mu .. autofunction:: load_html .. autofunction:: load_json .. autofunction:: load_image .. 
autofunction:: load_svg """ from __future__ import annotations import builtins import re from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, cast from lamin_utils import logger from lamindb_setup import settings as setup_settings from lamindb_setup.core.upath import ( create_path, extract_suffix_from_path, infer_filesystem, ) if TYPE_CHECKING: from anndata import AnnData from lamindb_setup.types import AnyPathStr from mudata import MuData from pandas import DataFrame from lamindb.core.storage.types import ScverseDataStructures is_run_from_ipython = getattr(builtins, "__IPYTHON__", False) # tested in lamin-usecases def load_fcs(*args, **kwargs) -> AnnData: """Load an `.fcs` file to `AnnData`.""" try: import readfcs except ImportError: # pragma: no cover raise ImportError("Please install readfcs: pip install readfcs") from None return readfcs.read(*args, **kwargs) # for types below note that local UPaths are subclasses of Path # Path(UPath(...)) properly coerces local UPaths and throws an error for cloud UPaths def load_csv(path: Path | str, **kwargs) -> DataFrame: """Load `.csv` file to `DataFrame`.""" import pandas as pd path_sanitized = Path(path) return pd.read_csv(path_sanitized, **kwargs) def load_parquet(path: Path | str, **kwargs) -> DataFrame: """Load `.parquet` file to `DataFrame`.""" import pandas as pd path_sanitized = Path(path) return pd.read_parquet(path_sanitized, **kwargs) def load_tsv(path: Path | str, **kwargs) -> DataFrame: """Load `.tsv` file to `DataFrame`.""" import pandas as pd path_sanitized = Path(path) return pd.read_csv(path_sanitized, sep="\t", **kwargs) def load_h5ad(filepath: AnyPathStr, **kwargs) -> AnnData: """Load an `.h5ad` file to `AnnData`.""" from anndata import read_h5ad fs, filepath_str = infer_filesystem(filepath) compression = kwargs.pop("compression", "infer") with fs.open(filepath_str, mode="rb", compression=compression) as file: adata = read_h5ad(file, backed=False, **kwargs) return adata def load_h5mu(filepath: Path | str, **kwargs) -> MuData: """Load an `.h5mu` file to `MuData`.""" import mudata as md path_sanitized = Path(filepath) return md.read_h5mu(path_sanitized, **kwargs) def load_zarr(storepath, **kwargs): # type: ignore try: from ..core.storage._zarr import load_zarr as _load_zarr except ImportError: raise ImportError("Please install zarr: pip install 'lamindb[zarr]'") from None return _load_zarr(storepath, **kwargs) def load_html(path: Path | str) -> None | Path | str: """Display `.html` in ipython, otherwise return path.""" if is_run_from_ipython: path_sanitized = Path(path) with path_sanitized.open(encoding="utf-8") as f: html_content = f.read() # Extract the body content using regular expressions body_content = re.findall( r"(?:.*?)", html_content, re.DOTALL ) # Remove any empty body tags if body_content: body_content = body_content[0] body_content = body_content.strip() # type: ignore from IPython.display import HTML, display display(HTML(data=body_content)) return None else: return path def load_json(path: Path | str) -> dict[str, Any] | list[Any]: """Load `.json` to `dict`.""" import json path_sanitized = Path(path) with path_sanitized.open(encoding="utf-8") as f: data = json.load(f) return data def load_yaml(path: Path | str) -> dict[str, Any] | list[Any]: """Load `.yaml` to `dict`.""" import yaml # type: ignore path_sanitized = Path(path) with path_sanitized.open(encoding="utf-8") as f: data = yaml.safe_load(f) return data def load_image(path: Path | str) -> None | Path | str: """Display `.jpg`, `.gif` or 
`.png` in ipython, otherwise return path.""" if is_run_from_ipython: from IPython.display import Image, display path_sanitized = Path(path) display(Image(filename=path_sanitized.as_posix())) return None else: return path def load_svg(path: Path | str) -> None | Path | str: """Display `.svg` in ipython, otherwise return path.""" if is_run_from_ipython: from IPython.display import SVG, display path_sanitized = Path(path) display(SVG(filename=path_sanitized.as_posix())) return None else: return path def load_txt(path: Path | str) -> str: """Load `.txt` file to `str`.""" path_sanitized = Path(path) return path_sanitized.read_text(encoding="utf-8") def load_rds(path: Path | str) -> Path | str: """Just warn when trying to load `.rds`.""" logger.warning("Please use `laminr` to load `.rds` files") return path FILE_LOADERS = { ".csv": load_csv, ".csv.gz": load_csv, ".csv.tar.gz": load_csv, ".tsv": load_tsv, ".tsv.gz": load_tsv, ".tsv.tar.gz": load_tsv, ".h5ad": load_h5ad, ".h5ad.gz": load_h5ad, ".h5ad.tar.gz": load_h5ad, ".parquet": load_parquet, ".fcs": load_fcs, ".zarr": load_zarr, ".anndata.zarr": load_zarr, ".html": load_html, ".json": load_json, ".vitessce.json": load_json, ".yaml": load_yaml, ".h5mu": load_h5mu, ".gif": load_image, ".jpg": load_image, ".png": load_image, ".svg": load_svg, ".rds": load_rds, ".txt": load_txt, ".fasta": load_txt, } SUPPORTED_SUFFIXES = [sfx for sfx in FILE_LOADERS.keys() if sfx != ".rds"] """Suffixes with defined artifact loaders.""" def load_to_memory( filepath: AnyPathStr, **kwargs ) -> DataFrame | ScverseDataStructures | dict[str, Any] | list[Any] | AnyPathStr | None: """Load a file into memory. Returns the filepath if no in-memory form is found. May return None in interactive sessions for images. """ filepath = create_path(filepath) suffix = extract_suffix_from_path(filepath) loader = FILE_LOADERS.get(suffix, None) if loader is None: raise NotImplementedError( f"There is no loader for {suffix} files. Use .cache() to get the path." ) filepath = setup_settings.paths.cloud_to_local(filepath, print_progress=True) return cast(Callable[..., Any], loader)(filepath, **kwargs) ================================================ FILE: lamindb/core/storage/__init__.py ================================================ """Storage API. Valid suffixes. .. autodata:: VALID_SUFFIXES Array accessors. .. autoclass:: AnnDataAccessor .. autoclass:: SpatialDataAccessor .. 
autoclass:: BackedAccessor """ from typing import TYPE_CHECKING, Any from lamindb_setup.core.upath import LocalPathClasses, UPath, infer_filesystem from ._valid_suffixes import VALID_SUFFIXES from .paths import delete_storage if TYPE_CHECKING: from ._anndata_accessor import AnnDataAccessor from ._backed_access import BackedAccessor from ._spatialdata_accessor import SpatialDataAccessor from ._tiledbsoma import save_tiledbsoma_experiment from .objects import infer_suffix, write_to_disk __all__ = [ "AnnDataAccessor", "BackedAccessor", "LocalPathClasses", "SpatialDataAccessor", "UPath", "VALID_SUFFIXES", "delete_storage", "infer_filesystem", "infer_suffix", "save_tiledbsoma_experiment", "write_to_disk", ] _LAZY_EXPORTS = frozenset( { "AnnDataAccessor", "BackedAccessor", "SpatialDataAccessor", "infer_suffix", "save_tiledbsoma_experiment", "write_to_disk", } ) def __getattr__(name: str) -> Any: if name not in _LAZY_EXPORTS: raise AttributeError(f"module {__name__!r} has no attribute {name!r}") attr: Any if name == "AnnDataAccessor": from ._anndata_accessor import AnnDataAccessor as attr elif name == "BackedAccessor": from ._backed_access import BackedAccessor as attr elif name == "SpatialDataAccessor": from ._spatialdata_accessor import SpatialDataAccessor as attr elif name == "save_tiledbsoma_experiment": from ._tiledbsoma import save_tiledbsoma_experiment as attr else: from .objects import infer_suffix, write_to_disk attr = infer_suffix if name == "infer_suffix" else write_to_disk globals()[name] = attr return attr ================================================ FILE: lamindb/core/storage/_anndata_accessor.py ================================================ from __future__ import annotations import inspect from functools import cached_property from importlib.metadata import version as get_version from itertools import chain from typing import TYPE_CHECKING, Callable, Literal, Union import h5py import numpy as np import pandas as pd from anndata import AnnData from anndata._core.index import _normalize_indices from anndata._core.views import _resolve_idx from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5 from anndata._io.specs.registry import ( get_spec, read_elem, read_elem_partial, write_elem, ) from anndata.compat import _read_attr from fsspec.implementations.local import LocalFileSystem from fsspec.utils import infer_compression from lamin_utils import logger from lamindb_setup.core.upath import S3FSMap, infer_filesystem from packaging import version from upath import UPath if TYPE_CHECKING: from collections.abc import Mapping from fsspec.core import OpenFile from lamindb_setup.types import AnyPathStr from lamindb import Artifact anndata_version_parse = version.parse(get_version("anndata")) if anndata_version_parse < version.parse("0.9.0"): from anndata._core.index import Index else: from anndata.compat import Index if anndata_version_parse < version.parse("0.10.0"): if anndata_version_parse < version.parse("0.9.1"): logger.warning( "Full backed capabilities are not available for this version of anndata," " please install anndata>=0.9.1." 
) from anndata._core.sparse_dataset import SparseDataset # try csr for groups with no encoding_type class CSRDataset(SparseDataset): @property def format_str(self) -> str: return "csr" def sparse_dataset(group): return SparseDataset(group) else: if anndata_version_parse >= version.parse("0.11.0"): from anndata._core.sparse_dataset import ( # type: ignore _CSRDataset as CSRDataset, ) else: from anndata._core.sparse_dataset import CSRDataset # type: ignore from anndata._core.sparse_dataset import ( BaseCompressedSparseDataset as SparseDataset, ) from anndata._core.sparse_dataset import sparse_dataset # type: ignore def _check_group_format(*args): pass CSRDataset._check_group_format = _check_group_format # zarr and CSRDataset have problems with full selection def _subset_sparse(sparse_ds: CSRDataset | SparseDataset, indices): has_arrays = isinstance(indices[0], np.ndarray) or isinstance( indices[1], np.ndarray ) if not has_arrays and indices == (slice(None), slice(None)): return sparse_ds.to_memory() else: return sparse_ds[indices] def get_module_name(obj): return inspect.getmodule(obj).__name__.partition(".")[0] def _records_to_df(obj): if isinstance(obj, pd.DataFrame): return obj if hasattr(obj, "dtype") and obj.dtype.names is not None: formats = [] for name, (dt, _) in obj.dtype.fields.items(): if dt.char == "S": new_dt = str(dt).replace("S", "U") else: new_dt = dt formats.append((name, new_dt)) df = pd.DataFrame(obj.astype(formats, copy=False)) for index_name in ("index", "_index"): if index_name in df.columns: return df.set_index(index_name) return df else: return obj class AccessRegistry: def __init__(self): self._registry = {} self._openers = {} def register_open(self, module: str): def wrapper(func: Callable): self._openers[module] = func return func return wrapper def open(self, module: str, *args, **kwargs): if module in self._openers: return self._openers[module](*args, **kwargs) else: raise ValueError(f"Module {module} not found, please install it.") def register(self, module: str): def wrapper(func: Callable): func_name = func.__name__ if func_name not in self._registry: self._registry[func_name] = {} self._registry[func_name][module] = func return func return wrapper def __getattr__(self, func_name: str): def wrapper(*args, **kwargs): func_registry = self._registry[func_name] for arg in chain(args, kwargs.values()): arg_module = get_module_name(arg) if arg_module in func_registry: return func_registry[arg_module](*args, **kwargs) raise ValueError(f"{func_name} is not registered for this module.") return wrapper # storage specific functions should be registered and called through the registry registry = AccessRegistry() @registry.register_open("h5py") def open(filepath: AnyPathStr, mode: str = "r", compression: str | None = "infer"): fs, file_path_str = infer_filesystem(filepath) # we don't open compressed files directly because we need fsspec to uncompress on .open compression = ( infer_compression(file_path_str) if compression == "infer" else compression ) if isinstance(fs, LocalFileSystem) and compression is None: assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!" # noqa: S101 return None, h5py.File(file_path_str, mode=mode) if mode == "r": conn_mode = "rb" elif mode == "w": conn_mode = "wb" elif mode == "a": conn_mode = "ab" else: raise ValueError(f"Unknown mode {mode}! 
Should be 'r', 'w' or 'a'.") conn = fs.open(file_path_str, mode=conn_mode, compression=compression) try: storage = h5py.File(conn, mode=mode) except Exception as e: conn.close() raise e return conn, storage @registry.register("h5py") def read_dataframe(elem: h5py.Dataset | h5py.Group): if isinstance(elem, h5py.Dataset): return read_dataframe_legacy_h5(elem) else: return read_elem(elem) @registry.register("h5py") def safer_read_partial(elem, indices): is_dataset = isinstance(elem, h5py.Dataset) indices_inverse: list | None = None encoding_type = get_spec(elem).encoding_type # h5py selection for datasets requires sorted indices if is_dataset or encoding_type == "dataframe": indices_increasing = [] indices_inverse = [] for indices_dim in indices: # should be integer or bool # ignore bool or increasing unique integers if ( isinstance(indices_dim, np.ndarray) and indices_dim.dtype != "bool" and not np.all(np.diff(indices_dim) > 0) ): idx_unique, idx_inverse = np.unique(indices_dim, return_inverse=True) indices_increasing.append(idx_unique) indices_inverse.append(idx_inverse) else: indices_increasing.append(indices_dim) indices_inverse.append(None) indices = tuple(indices_increasing) if all(idx is None for idx in indices_inverse): indices_inverse = None result = None if encoding_type == "": if is_dataset: dims = len(elem.shape) if dims == 2: result = elem[indices] elif dims == 1: if indices[0] == slice(None): result = elem[indices[1]] elif indices[1] == slice(None): result = elem[indices[0]] elif isinstance(elem, h5py.Group): try: ds = CSRDataset(elem) result = _subset_sparse(ds, indices) except Exception as e: logger.debug( f"Encountered an exception while attempting to subset a sparse dataset by indices.\n{e}" ) if result is None: raise ValueError( "Can not get a subset of the element of type" f" {type(elem).__name__} with an empty spec." ) else: result = read_elem_partial(elem, indices=indices) if indices_inverse is None: return result else: if indices_inverse[0] is None: if len(result.shape) == 2: return result[:, indices_inverse[1]] else: return result[indices_inverse[1]] elif indices_inverse[1] is None: if isinstance(result, pd.DataFrame): return result.iloc[indices_inverse[0]] else: return result[indices_inverse[0]] else: return result[tuple(indices_inverse)] @registry.register("h5py") def keys(storage: h5py.File): attrs_keys: dict[str, list] = {} for attr in storage.keys(): if attr == "X": continue attr_obj = storage[attr] if attr in ("obs", "var") and isinstance(attr_obj, h5py.Dataset): keys = list(attr_obj.dtype.fields.keys()) else: keys = list(attr_obj.keys()) if len(keys) > 0: attrs_keys[attr] = keys return attrs_keys ArrayTypes = [h5py.Dataset] GroupTypes = [h5py.Group] StorageTypes = [h5py.File] ZARR_INSTALLED = False try: import zarr ZARR_INSTALLED = True except ImportError: pass if ZARR_INSTALLED: from anndata._io.zarr import read_dataframe_legacy as read_dataframe_legacy_zarr from ._zarr import IS_ZARR_V3, get_zarr_store ArrayTypes.append(zarr.Array) GroupTypes.append(zarr.Group) StorageTypes.append(zarr.Group) @registry.register_open("zarr") def open(filepath: AnyPathStr, mode: Literal["r", "r+", "a", "w", "w-"] = "r"): assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!" 
# noqa: S101 store = get_zarr_store(filepath) kwargs = {} if IS_ZARR_V3 and mode != "r": # otherwise unable to write kwargs["use_consolidated"] = False storage = zarr.open(store, mode=mode, **kwargs) # zarr v2 re-initializes the mapper # we need to put back the correct one # S3FSMap is returned from get_zarr_store only for zarr v2 if isinstance(store, S3FSMap): assert not IS_ZARR_V3 # noqa: S101 storage.store.map = store conn = None return conn, storage @registry.register("zarr") def read_dataframe(elem: Union[zarr.Array, zarr.Group]): # noqa if isinstance(elem, zarr.Array): return read_dataframe_legacy_zarr(elem) else: return read_elem(elem) @registry.register("zarr") def safer_read_partial(elem, indices): encoding_type = get_spec(elem).encoding_type if encoding_type == "": if isinstance(elem, zarr.Array): dims = len(elem.shape) if dims == 2: return elem.oindex[indices] elif dims == 1: if indices[0] == slice(None): return elem.oindex[indices[1]] elif indices[1] == slice(None): return elem.oindex[indices[0]] elif isinstance(elem, zarr.Group): try: ds = CSRDataset(elem) return _subset_sparse(ds, indices) except Exception as e: logger.debug( f"Encountered an exception while attempting to subset a sparse dataset by indices.\n{e}" ) raise ValueError( "Can not get a subset of the element of type" f" {type(elem).__name__} with an empty spec." ) else: if encoding_type in ("csr_matrix", "csc_matrix"): ds = sparse_dataset(elem) return _subset_sparse(ds, indices) else: indices = tuple( idim.tolist() if isinstance(idim, np.ndarray) and idim.dtype == "bool" else idim for idim in indices ) return read_elem_partial(elem, indices=indices) # this is needed because accessing zarr.Group.keys() directly is very slow @registry.register("zarr") def keys(storage: zarr.Group): if IS_ZARR_V3: paths = storage._sync_iter(storage.store.list()) else: paths = storage.store.keys() attrs_keys: dict[str, list] = {} obs_var_arrays = [] prefix = storage.path if prefix == "": paths_iter = (path for path in paths) else: prefix += "/" paths_iter = ( path.removeprefix(prefix) for path in paths if path.startswith(prefix) ) for path in paths_iter: if path in (".zattrs", ".zgroup"): continue parts = path.split("/") if len(parts) < 2: continue attr = parts[0] key = parts[1] if attr == "X": continue if attr in ("obs", "var"): if attr in obs_var_arrays: continue if key == ".zarray": attrs_keys.pop(attr, None) obs_var_arrays.append(attr) if attr not in attrs_keys: attrs_keys[attr] = [] if key in (".zattrs", ".zgroup", ".zarray"): continue attr_keys = attrs_keys[attr] if key not in attr_keys: attr_keys.append(key) for attr in obs_var_arrays: attrs_keys[attr] = list(storage[attr].dtype.fields.keys()) return {attr: keys for attr, keys in attrs_keys.items() if len(keys) > 0} ArrayTypes = tuple(ArrayTypes) # type: ignore GroupTypes = tuple(GroupTypes) # type: ignore StorageTypes = tuple(StorageTypes) # type: ignore ArrayType = Union[ArrayTypes] # type: ignore GroupType = Union[GroupTypes] # type: ignore StorageType = Union[StorageTypes] # type: ignore def _to_memory(elem): if isinstance(elem, ArrayTypes): return elem[()] elif isinstance(elem, SparseDataset): return elem.to_memory() else: return elem def _try_backed_full(elem): # think what to do for compatibility with old var and obs if isinstance(elem, ArrayTypes): return elem if isinstance(elem, GroupTypes): encoding_type = get_spec(elem).encoding_type if encoding_type in ("csr_matrix", "csc_matrix"): return sparse_dataset(elem) if "h5sparse_format" in elem.attrs: return 
sparse_dataset(elem) if encoding_type == "" and "indptr" in elem: return CSRDataset(elem) return read_elem(elem) def _to_index(elem: np.ndarray): if elem.dtype in (np.float64, np.int64): elem = elem.astype(str) return pd.Index(elem) def _safer_read_index(elem): if isinstance(elem, GroupTypes): return _to_index(read_elem(elem[_read_attr(elem.attrs, "_index")])) elif isinstance(elem, ArrayTypes): indices = None for index_name in ("index", "_index"): if index_name in elem.dtype.names: indices = elem[index_name] break if indices is not None and len(indices) > 0: if isinstance(indices[0], bytes): indices = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)(indices) return _to_index(indices) else: raise ValueError("Indices not found.") else: raise ValueError(f"Unknown elem type {type(elem)} when reading indices.") class _MapAccessor: def __init__(self, elem, name, indices=None): self.elem = elem self.indices = indices self.name = name def __getitem__(self, key): if self.indices is None: return _try_backed_full(self.elem[key]) else: return registry.safer_read_partial(self.elem[key], indices=self.indices) def keys(self): return list(self.elem.keys()) def __repr__(self): """Description of the _MapAccessor object.""" descr = f"Accessor for the AnnData attribute {self.name}" descr += f"\n with keys: {self.keys()}" return descr def _safer_read_df(elem, indices=None): if indices is not None: obj = registry.safer_read_partial(elem, indices=indices) df = _records_to_df(obj) else: df = registry.read_dataframe(elem) if df.index.dtype in (np.float64, np.int64): df.index = df.index.astype(str) return df class _AnnDataAttrsMixin: storage: StorageType _attrs_keys: Mapping[str, list] @cached_property def obs(self) -> pd.DataFrame | None: if "obs" not in self._attrs_keys: return None indices = getattr(self, "indices", None) return _safer_read_df( self.storage["obs"], # type: ignore indices=(indices[0], slice(None)) if indices is not None else None, ) @cached_property def var(self) -> pd.DataFrame | None: if "var" not in self._attrs_keys: return None indices = getattr(self, "indices", None) return _safer_read_df( self.storage["var"], # type: ignore indices=(indices[1], slice(None)) if indices is not None else None, ) @cached_property def uns(self): if "uns" not in self._attrs_keys: return None return read_elem(self.storage["uns"]) @cached_property def X(self): indices = getattr(self, "indices", None) if indices is not None: return registry.safer_read_partial(self.storage["X"], indices=indices) else: return _try_backed_full(self.storage["X"]) @cached_property def obsm(self): if "obsm" not in self._attrs_keys: return None indices = getattr(self, "indices", None) if indices is not None: indices = (indices[0], slice(None)) return _MapAccessor(self.storage["obsm"], "obsm", indices) @cached_property def varm(self): if "varm" not in self._attrs_keys: return None indices = getattr(self, "indices", None) if indices is not None: indices = (indices[1], slice(None)) return _MapAccessor(self.storage["varm"], "varm", indices) @cached_property def obsp(self): if "obsp" not in self._attrs_keys: return None indices = getattr(self, "indices", None) if indices is not None: indices = (indices[0], indices[0]) return _MapAccessor(self.storage["obsp"], "obsp", indices) @cached_property def varp(self): if "varp" not in self._attrs_keys: return None indices = getattr(self, "indices", None) if indices is not None: indices = (indices[1], indices[1]) return _MapAccessor(self.storage["varp"], "varp", indices) @cached_property def layers(self): 
if "layers" not in self._attrs_keys: return None indices = getattr(self, "indices", None) return _MapAccessor(self.storage["layers"], "layers", indices) @property def obs_names(self): return self._obs_names @property def var_names(self): return self._var_names @cached_property def shape(self): return len(self._obs_names), len(self._var_names) def to_dict(self): prepare_adata = {} prepare_adata["X"] = _to_memory(self.X) if "uns" in self._attrs_keys: prepare_adata["uns"] = self.uns for attr in ("obs", "var"): if attr in self._attrs_keys: prepare_adata[attr] = getattr(self, attr) for attr in ("obsm", "varm", "obsp", "varp", "layers"): if attr in self._attrs_keys: prepare_adata[attr] = {} get_attr = getattr(self, attr) for key in self._attrs_keys[attr]: prepare_adata[attr][key] = _to_memory(get_attr[key]) if "raw" in self._attrs_keys: prepare_adata["raw"] = self.raw.to_dict() return prepare_adata def to_memory(self): adata = AnnData(**self.to_dict()) return adata class AnnDataAccessorSubset(_AnnDataAttrsMixin): def __init__(self, storage, indices, attrs_keys, obs_names, var_names, ref_shape): self.storage = storage self.indices = indices self._attrs_keys = attrs_keys self._obs_names, self._var_names = obs_names, var_names self._ref_shape = ref_shape def __getitem__(self, index: Index): """Access a subset of the underlying AnnData object.""" oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names) new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx] if self.indices is not None: oidx = _resolve_idx(self.indices[0], oidx, self._ref_shape[0]) vidx = _resolve_idx(self.indices[1], vidx, self._ref_shape[1]) return type(self)( self.storage, (oidx, vidx), self._attrs_keys, new_obs_names, new_var_names, self._ref_shape, ) def __repr__(self): """Description of the object.""" n_obs, n_vars = self.shape descr = f"{type(self).__name__} object with n_obs × n_vars = {n_obs} × {n_vars}" for attr, keys in self._attrs_keys.items(): descr += f"\n {attr}: {keys}" return descr @cached_property def raw(self): if "raw" not in self._attrs_keys: return None prepare_indices = None if self.indices is not None: oidx = self.indices[0] if isinstance(oidx, np.ndarray) or oidx != slice(None): prepare_indices = oidx, slice(None) return AnnDataRawAccessor( self.storage["raw"], prepare_indices, None, self._obs_names, None, self._ref_shape[0], ) class AnnDataRawAccessor(AnnDataAccessorSubset): def __init__( self, storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape ): var_raw = storage_raw["var"] if var_names is None: var_names = _safer_read_index(var_raw) if isinstance(ref_shape, int): ref_shape = ref_shape, len(var_names) elif isinstance(ref_shape, tuple) and len(ref_shape) < 2: ref_shape = ref_shape[0], len(var_names) if attrs_keys is None: attrs_keys = {} if isinstance(var_raw, ArrayTypes): attrs_keys["var"] = list(var_raw.dtype.fields.keys()) else: # for some reason list(var_raw.keys()) is very slow for zarr # maybe also directly get keys from the underlying mapper attrs_keys["var"] = list(var_raw) if "varm" in storage_raw: varm_keys_raw = list(storage_raw["varm"]) if len(varm_keys_raw) > 0: attrs_keys["varm"] = varm_keys_raw super().__init__( storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape ) @property def raw(self): raise AttributeError class AnnDataAccessor(_AnnDataAttrsMixin): """Cloud-backed AnnData.""" def __init__( self, connection: OpenFile | None, storage: StorageType, filename: str, artifact: Artifact | None = None, ): self._conn = connection 
self.storage = storage self._attrs_keys = registry.keys(self.storage) self._name = filename self._obs_names = _safer_read_index(self.storage["obs"]) # type: ignore self._var_names = _safer_read_index(self.storage["var"]) # type: ignore self._artifact = artifact # save artifact to update in write mode self._updated = False # track updates in r+ mode for zarr self._entered = False # check that the context manager is used self._closed = False def close(self): """Closes the connection.""" storage = self.storage connection = self._conn if self._updated and (artifact := self._artifact) is not None: from lamindb.models.artifact import Artifact from lamindb.models.sqlrecord import init_self_from_db # now self._updated can only be True for zarr assert ZARR_INSTALLED # noqa: S101 store = storage.store keys = storage._sync_iter(store.list()) if IS_ZARR_V3 else store.keys() # this checks that there consolidated metadata was written before # need to update it # zmetadata is in spatialdata sometimes for some reason if ".zmetadata" in keys or "zmetadata" in keys: zarr.consolidate_metadata(store) new_version = Artifact( artifact.path, revises=artifact, _is_internal_call=True ).save() # note: sets _state.db = "default" init_self_from_db(artifact, new_version) if hasattr(storage, "close"): storage.close() if hasattr(connection, "close"): connection.close() self._closed = True @property def closed(self): return self._closed def __enter__(self): self._entered = True return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() def __getitem__(self, index: Index) -> AnnDataAccessorSubset: """Access a subset of the underlying AnnData object.""" oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names) new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx] return AnnDataAccessorSubset( self.storage, (oidx, vidx), self._attrs_keys, new_obs_names, new_var_names, self.shape, ) def __repr__(self): """Description of the AnnDataAccessor object.""" n_obs, n_vars = self.shape descr = f"AnnDataAccessor object with n_obs × n_vars = {n_obs} × {n_vars}" descr += f"\n constructed for the AnnData object {self._name}" for attr, keys in self._attrs_keys.items(): descr += f"\n {attr}: {keys}" return descr @cached_property def raw(self): if "raw" not in self._attrs_keys: return None return AnnDataRawAccessor( self.storage["raw"], None, None, self._obs_names, None, self.shape[0] ) def add_column( self, where: Literal["obs", "var"], col_name: str, col: np.ndarray | pd.Categorical, ): """Add a new column to .obs or .var of the underlying AnnData object.""" df_store = self.storage[where] # type: ignore if getattr(df_store, "read_only", True): raise ValueError( "You can use .add_column(...) only with zarr in a writable mode." ) write_elem(df_store, col_name, col) df_store.attrs["column-order"] = df_store.attrs["column-order"] + [col_name] # remind only once if this wasn't updated before and not in the context manager if not self._updated and not self._entered and self._artifact is not None: logger.important( "Do not forget to call .close() after you finish " f"working with this accessor for {self._name} " "to automatically update the corresponding artifact." 
) self._updated = True # reset the cached property # todo: maybe just append the column if the df was already loaded self.__dict__.pop(where, None) # update the cached columns self._attrs_keys[where].append(col_name) # get the number of observations in an anndata object or file fast and safely def _anndata_n_observations(object: AnyPathStr | AnnData) -> int | None: if isinstance(object, AnnData): return object.n_obs try: objectpath = UPath(object) conn_module = None if ".h5ad" in objectpath.suffixes: conn_module = "h5py" elif objectpath.suffix == ".zarr": conn_module = "zarr" conn, storage = registry.open(conn_module, objectpath, mode="r") except Exception as e: logger.warning(f"Could not open {object} to read n_observations: {e}") return None n_observations: int | None = None try: obs = storage["obs"] if isinstance(obs, GroupTypes): # type: ignore if "_index" in obs.attrs: elem_key = _read_attr(obs.attrs, "_index") else: elem_key = next(iter(obs)) elem = obs[elem_key] if isinstance(elem, ArrayTypes): # type: ignore n_observations = elem.shape[0] else: # assume standard obs group n_observations = elem["codes"].shape[0] else: n_observations = obs.shape[0] except Exception as e: logger.warning(f"Could not read n_observations from anndata {object}: {e}") finally: if hasattr(storage, "close"): storage.close() if hasattr(conn, "close"): conn.close() return n_observations ================================================ FILE: lamindb/core/storage/_backed_access.py ================================================ from __future__ import annotations from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Callable, Literal PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather", ".ipc") POLARS_SUFFIXES = (".parquet", ".csv", ".ndjson", ".ipc") if TYPE_CHECKING: from collections.abc import Iterator from fsspec.core import OpenFile from polars import LazyFrame as PolarsLazyFrame from pyarrow.dataset import Dataset as PyArrowDataset from tiledbsoma import Collection as SOMACollection from tiledbsoma import Experiment as SOMAExperiment from tiledbsoma import Measurement as SOMAMeasurement from upath import UPath from lamindb.models.artifact import Artifact from ._anndata_accessor import AnnDataAccessor, StorageType from ._spatialdata_accessor import SpatialDataAccessor # this dynamically creates a subclass of a context manager class # and reassigns it to an instance of the superclass # so that the instance calls finalize on close or exit def _track_writes_factory(obj: Any, finalize: Callable): closed: bool = False tracked_class = obj.__class__ type_dict = {"__doc__": tracked_class.__doc__} if hasattr(tracked_class, "__slots__"): type_dict["__slots__"] = () if hasattr(tracked_class, "__exit__"): def __exit__(self, exc_type, exc_val, exc_tb): nonlocal closed tracked_class.__exit__(self, exc_type, exc_val, exc_tb) if not closed: finalize() closed = True type_dict["__exit__"] = __exit__ if hasattr(tracked_class, "close"): def close(self, *args, **kwargs): nonlocal closed tracked_class.close(self, *args, **kwargs) if not closed: finalize() closed = True type_dict["close"] = close Track = type(tracked_class.__name__ + "Track", (tracked_class,), type_dict) obj.__class__ = Track return obj @dataclass class BackedAccessor: """h5py.File or zarr.Group accessor.""" connection: OpenFile """The connection.""" storage: StorageType """The storage access.""" def backed_access( artifact_or_filepath: Artifact | UPath, mode: str = "r", engine: Literal["pyarrow", "polars"] = "pyarrow", 
using_key: str | None = None, **kwargs, ) -> ( AnnDataAccessor | SpatialDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment | SOMAMeasurement | PyArrowDataset | Iterator[PolarsLazyFrame] ): from lamindb.models import Artifact from .paths import filepath_from_artifact if isinstance(artifact_or_filepath, Artifact): artifact = artifact_or_filepath objectpath, _ = filepath_from_artifact(artifact, using_key=using_key) else: artifact = None objectpath = artifact_or_filepath name = objectpath.name suffix = objectpath.suffix non_gz_suffix = _non_gz_suffix(objectpath.suffixes) if name == "soma" or suffix == ".tiledbsoma": if mode not in {"r", "w"}: raise ValueError("`mode` should be either 'r' or 'w' for tiledbsoma.") from ._tiledbsoma import _open_tiledbsoma return _open_tiledbsoma(objectpath, mode=mode, **kwargs) # type: ignore elif non_gz_suffix in {".h5", ".hdf5", ".h5ad"}: from ._anndata_accessor import registry conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs) elif suffix == ".zarr": from ._anndata_accessor import registry if mode not in {"r", "r+"}: raise ValueError("`mode` should be either 'r' or 'r+' for zarr.") conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs) if "spatialdata_attrs" in storage.attrs: from ._spatialdata_accessor import SpatialDataAccessor return SpatialDataAccessor(storage, name, artifact) elif len(df_suffixes := _flat_suffixes(objectpath)) == 1 and ( df_suffix := df_suffixes.pop() ) in set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES): return _open_dataframe(objectpath, df_suffix, engine, **kwargs) else: raise ValueError( "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix " f"be compatible with pyarrow.dataset.dataset or polars.scan_* functions, " f"instead of being {suffix} object." ) import h5py from anndata._io.specs.registry import get_spec from ._anndata_accessor import AnnDataAccessor is_anndata = ( non_gz_suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata" ) if is_anndata: if mode != "r" and isinstance(storage, h5py.Group): raise ValueError("Can only access `hdf5` `AnnData` with mode='r'.") return AnnDataAccessor(conn, storage, name, artifact) else: return BackedAccessor(conn, storage) def _non_gz_suffix(suffixes: list[str]) -> str: len_suffixes = len(suffixes) if len_suffixes == 0: return "" if len_suffixes > 1 and ".gz" in suffixes: if (suffix := suffixes[-2]) != ".tar": return suffix elif len_suffixes > 2: return suffixes[-3] return suffixes[-1] def _flat_suffixes(paths: UPath | list[UPath]) -> set[str]: # it is assumed here that the paths exist # we don't check here that the filesystem is the same # but this is a requirement for pyarrow.dataset.dataset path_list = [] paths_list = paths if isinstance(paths, list) else [paths] for path in paths_list: # assume http is always a file if path.protocol not in {"http", "https"} and path.is_dir(): path_list += [p for p in path.rglob("*") if p.suffix != ""] else: path_list.append(path) return {path.suffix for path in path_list} def _open_dataframe( paths: UPath | list[UPath], suffix: str | None = None, engine: Literal["pyarrow", "polars"] = "pyarrow", **kwargs, ) -> PyArrowDataset | Iterator[PolarsLazyFrame]: from ._polars_lazy_df import POLARS_SUFFIXES, _open_polars_lazy_df from ._pyarrow_dataset import PYARROW_SUFFIXES, _open_pyarrow_dataset if engine not in {"pyarrow", "polars"}: raise ValueError( f"Unknown engine: {engine}. It should be 'pyarrow' or 'polars'." 
) df_suffix: str if suffix is None: df_suffixes = _flat_suffixes(paths) if len(df_suffixes) > 1: raise ValueError( f"The artifacts in the collection have different file formats: {', '.join(df_suffixes)}.\n" "It is not possible to open such stores with pyarrow or polars." ) df_suffix = df_suffixes.pop() else: df_suffix = suffix if engine == "pyarrow" and df_suffix not in PYARROW_SUFFIXES: raise ValueError( f"{df_suffix} files are not supported by pyarrow, " f"they should have one of these formats: {', '.join(PYARROW_SUFFIXES)}." ) elif engine == "polars" and df_suffix not in POLARS_SUFFIXES: raise ValueError( f"{df_suffix} files are not supported by polars, " f"they should have one of these formats: {', '.join(POLARS_SUFFIXES)}." ) polars_without_fsspec = engine == "polars" and not kwargs.get("use_fsspec", False) paths_list = paths if isinstance(paths, list) else [paths] if (engine == "pyarrow" or polars_without_fsspec) and len(paths_list) > 1: # this checks that the filesystem is the same for all paths # this is a requirement of pyarrow.dataset.dataset fs = paths_list[0].fs for path in paths_list[1:]: # this assumes that the filesystems are cached by fsspec if path.fs is not fs: engine_msg = ( "polars engine without passing `use_fsspec=True`" if engine == "polars" else "pyarrow engine" ) raise ValueError( "The collection has artifacts with different filesystems, " f"this is not supported for {engine_msg}." ) return ( _open_pyarrow_dataset(paths, **kwargs) if engine == "pyarrow" else _open_polars_lazy_df(paths, **kwargs) ) ================================================ FILE: lamindb/core/storage/_polars_lazy_df.py ================================================ from __future__ import annotations from contextlib import contextmanager from typing import TYPE_CHECKING from lamindb_setup.core.upath import _ensure_sync_with_fs, get_storage_region if TYPE_CHECKING: from collections.abc import Iterator from polars import LazyFrame as PolarsLazyFrame from upath import UPath POLARS_SUFFIXES = (".parquet", ".csv", ".ndjson", ".ipc") def _polars_options(storepath: UPath) -> dict: polars_options: dict = {} storage_options: dict[str, str | bool] = {} fs = storepath.fs fs.connect() endpoint_url = fs.endpoint_url if endpoint_url is not None: storage_options["aws_virtual_hosted_style_request"] = False storage_options["aws_endpoint_url"] = endpoint_url if endpoint_url.startswith("http://"): storage_options["aws_allow_http"] = True else: storage_options["aws_region"] = get_storage_region(storepath) if fs.anon: storage_options["aws_skip_signature"] = True else: aws_key = fs.key aws_secret = fs.secret aws_token = fs.token if aws_key is not None and aws_secret is not None: storage_options["aws_access_key_id"] = aws_key storage_options["aws_secret_access_key"] = aws_secret if aws_token is not None: storage_options["aws_session_token"] = aws_token else: from aiobotocore.credentials import AioRefreshableCredentials if isinstance( refreshable_credentials := fs.session._credentials, AioRefreshableCredentials, ): refresh_sync = _ensure_sync_with_fs( refreshable_credentials._refresh, fs ) def credential_provider_fn(): # refresh and access the credentials refresh_sync() expiry_time = refreshable_credentials._expiry_time return { "aws_access_key_id": refreshable_credentials._access_key, "aws_secret_access_key": refreshable_credentials._secret_key, "aws_session_token": refreshable_credentials._token, }, int(expiry_time.timestamp()) if expiry_time is not None else None polars_options["credential_provider"] = 
credential_provider_fn polars_options["storage_options"] = storage_options return polars_options @contextmanager def _open_polars_lazy_df( paths: UPath | list[UPath], use_fsspec: bool = False, **kwargs ) -> Iterator[PolarsLazyFrame]: try: import polars as pl except ImportError as ie: raise ImportError("Please install polars: pip install polars") from ie scans = { ".parquet": pl.scan_parquet, ".csv": pl.scan_csv, ".ndjson": pl.scan_ndjson, ".ipc": pl.scan_ipc, } path_list = [] paths_list = paths if isinstance(paths, list) else [paths] for path in paths_list: # assume http is always a file if path.protocol not in {"http", "https"} and path.is_dir(): path_list += [p for p in path.rglob("*") if p.suffix != ""] else: path_list.append(path) # assume the filesystem is the same for all # it is checked in _open_dataframe path0 = path_list[0] if ( not use_fsspec and path0.protocol == "s3" and "storage_options" not in kwargs and "credential_provider" not in kwargs ): kwargs.update(_polars_options(path0)) open_files = [] try: for path in path_list: open_files.append(path.open(mode="rb") if use_fsspec else path.as_posix()) yield scans[path_list[0].suffix](open_files, **kwargs) finally: if use_fsspec: for open_file in open_files: open_file.close() ================================================ FILE: lamindb/core/storage/_pyarrow_dataset.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING import pyarrow.dataset from lamindb_setup.core.upath import LocalPathClasses if TYPE_CHECKING: from pyarrow.dataset import Dataset as PyArrowDataset from upath import UPath PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather", ".ipc") def _open_pyarrow_dataset(paths: UPath | list[UPath], **kwargs) -> PyArrowDataset: if isinstance(paths, list): # a single path can be a directory, but a list of paths # has to be a flat list of files paths_str = [] path0 = paths[0] if isinstance(path0, LocalPathClasses): path_to_str = lambda p: p.as_posix() filesystem = None else: path_to_str = lambda p: p.path filesystem = path0.fs for path in paths: if ( getattr(path, "protocol", None) not in {"http", "https"} and path.is_dir() ): paths_str += [path_to_str(p) for p in path.rglob("*") if p.suffix != ""] else: paths_str.append(path_to_str(path)) elif isinstance(paths, LocalPathClasses): paths_str, filesystem = paths.as_posix(), None else: paths_str, filesystem = paths.path, paths.fs return pyarrow.dataset.dataset(paths_str, filesystem=filesystem, **kwargs) ================================================ FILE: lamindb/core/storage/_spatialdata_accessor.py ================================================ from __future__ import annotations from functools import cached_property from typing import TYPE_CHECKING from ._anndata_accessor import AnnDataAccessor if TYPE_CHECKING: from zarr import Group from lamindb import Artifact class _TablesAccessor: def __init__(self, tables: Group, artifact: Artifact | None = None): self._tables = tables self._artifact = artifact def __getitem__(self, key: str) -> AnnDataAccessor: return AnnDataAccessor( connection=None, storage=self._tables[key], filename=key, artifact=self._artifact, ) def keys(self) -> list[str]: return list(self._tables.keys()) def __repr__(self) -> str: """Description of the _TablesAccessor object.""" descr = ( f"Accessor for the SpatialData attribute tables\n with keys: {self.keys()}" ) return descr class SpatialDataAccessor: """Cloud-backed SpatialData. For now only allows to access `tables`. 
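    Example (the artifact key and table name are illustrative)::

        import lamindb as ln

        # opening a SpatialData zarr artifact yields a SpatialDataAccessor
        access = ln.Artifact.get(key="visium.spatialdata.zarr").open()
        access.tables.keys()            # available table names
        adata = access.tables["table"]  # AnnDataAccessor for one table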
""" def __init__(self, storage: Group, name: str, artifact: Artifact | None = None): self.storage = storage self._name = name self._artifact = artifact @cached_property def tables(self) -> _TablesAccessor: """tables of the underlying SpatialData object.""" return _TablesAccessor(self.storage["tables"], self._artifact) def __repr__(self): """Description of the SpatialDataAccessor object.""" descr = ( "SpatialDataAccessor object" f"\n constructed for the SpatialData object {self._name}" f"\n with tables: {self.tables.keys()}" ) return descr ================================================ FILE: lamindb/core/storage/_tiledbsoma.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Literal from urllib.parse import urlparse import pandas as pd import pyarrow as pa from anndata import AnnData, read_h5ad from lamin_utils import logger from lamindb_setup import settings as setup_settings from lamindb_setup.core.upath import ( LocalPathClasses, _ensure_sync_with_fs, create_path, get_storage_region, ) from packaging import version if TYPE_CHECKING: from lamindb_setup.types import AnyPathStr from tiledbsoma import Collection as SOMACollection from tiledbsoma import Experiment as SOMAExperiment from tiledbsoma import Measurement as SOMAMeasurement from tiledbsoma import SOMATileDBContext from upath import UPath from lamindb.models.artifact import Artifact from lamindb.models.run import Run def _load_h5ad_zarr(objpath: UPath): from lamindb.core.loaders import load_h5ad, load_zarr if objpath.is_dir(): adata = load_zarr(objpath, expected_type="anndata") else: # read only local in backed for now # in principle possible to read remote in backed also if isinstance(objpath, LocalPathClasses): adata = read_h5ad(objpath.as_posix(), backed="r") else: adata = load_h5ad(objpath) return adata class SOMAS3ContextFactory: """Prepares and caches soma.SOMATileDBContext for a given storepath. For S3 storage with federated credentials, credentials are read and refreshed only when the store is opened—i.e. when :meth:`get_context` is called as part of opening the TileDB-SOMA store. They are not updated while a store handle is held open. If credentials expire during a long-lived session, close the store and open it again to refresh. 
""" def __init__(self, storepath: UPath): from tiledbsoma import SOMATileDBContext self._refreshable_credentials = None fs = storepath.fs fs.connect() self._fs = fs tiledb_config = {} endpoint_url = fs.endpoint_url if endpoint_url is not None: tiledb_config["vfs.s3.region"] = "" tiledb_config["vfs.s3.use_virtual_addressing"] = "false" parsed = urlparse(endpoint_url) tiledb_config["vfs.s3.scheme"] = parsed.scheme tiledb_config["vfs.s3.endpoint_override"] = ( parsed._replace(scheme="").geturl().lstrip("/") ) else: tiledb_config["vfs.s3.region"] = get_storage_region(storepath) if fs.anon: tiledb_config["vfs.s3.no_sign_request"] = "true" tiledb_config["vfs.s3.aws_access_key_id"] = "" tiledb_config["vfs.s3.aws_secret_access_key"] = "" tiledb_config["vfs.s3.aws_session_token"] = "" else: aws_key = fs.key aws_secret = fs.secret aws_token = fs.token if aws_key is not None and aws_secret is not None: tiledb_config["vfs.s3.aws_access_key_id"] = aws_key tiledb_config["vfs.s3.aws_secret_access_key"] = aws_secret if aws_token is not None: tiledb_config["vfs.s3.aws_session_token"] = aws_token else: from aiobotocore.credentials import AioRefreshableCredentials if isinstance( refreshable_credentials := fs.session._credentials, AioRefreshableCredentials, ): self._refreshable_credentials = refreshable_credentials tiledb_config.update(self._extract_refreshable_credentials()) self._context = SOMATileDBContext(tiledb_config=tiledb_config) def _extract_refreshable_credentials(self) -> dict: tiledb_config: dict[str, str] = {} refreshable_credentials = self._refreshable_credentials if refreshable_credentials is None: return tiledb_config # refresh and retrieve the credentials _ensure_sync_with_fs(refreshable_credentials._refresh, self._fs)() tiledb_config["vfs.s3.aws_access_key_id"] = refreshable_credentials._access_key tiledb_config["vfs.s3.aws_secret_access_key"] = ( refreshable_credentials._secret_key ) if (aws_token := refreshable_credentials._token) is not None: tiledb_config["vfs.s3.aws_session_token"] = aws_token return tiledb_config def get_context(self) -> SOMATileDBContext: # update the credentials if needed and return the updated context refreshed_credentials = self._extract_refreshable_credentials() if refreshed_credentials: self._context = self._context.replace(tiledb_config=refreshed_credentials) return self._context def _open_tiledbsoma( storepath: UPath, mode: Literal["r", "w"] = "r" ) -> SOMACollection | SOMAExperiment | SOMAMeasurement: """Open a TileDB-SOMA store for the given path. For S3 paths with federated credentials, credentials are refreshed at open time only (see :class:`SOMAS3ContextFactory`). 
""" try: import tiledbsoma as soma except ImportError as e: raise ImportError("Please install tiledbsoma: pip install tiledbsoma") from e storepath_str = storepath.as_posix() if storepath.protocol == "s3": ctx = SOMAS3ContextFactory(storepath).get_context() # this is a strange bug # for some reason iterdir futher gives incorrect results # if cache is not invalidated # instead of obs and ms it gives ms and ms in the list of names storepath.fs.invalidate_cache() else: ctx = None soma_objects = [obj.name for obj in storepath.iterdir()] if "obs" in soma_objects and "ms" in soma_objects: SOMAType = soma.Experiment elif "var" in soma_objects: SOMAType = soma.Measurement else: SOMAType = soma.Collection return SOMAType.open(storepath_str, mode=mode, context=ctx) def save_tiledbsoma_experiment( # Artifact args adatas: list[AnnData | AnyPathStr], key: str | None = None, description: str | None = None, run: Run | None = None, revises: Artifact | None = None, # tiledbsoma.io.from_anndata args measurement_name: str = "RNA", obs_id_name: str = "obs_id", var_id_name: str = "var_id", append_obsm_varm: bool = False, # additional keyword args for tiledbsoma.io.from_anndata **kwargs, ) -> Artifact: """Write `AnnData` to `tiledbsoma.Experiment`. Reads `AnnData` objects, writes them to `tiledbsoma.Experiment`, creates & saves an :class:`~lamindb.Artifact`. Populates a column `lamin_run_uid` column in `obs` with the current `run.uid`. Is based on `tiledbsoma.io.from_anndata `__. Args: adatas: `AnnData` objects to write, in-memory or on-disk. key: An optional key to reference the artifact. description: A description. run: The run that creates the artifact. revises: `lamindb.Artifact` with `tiledbsoma.Experiment` to append to. measurement_name: The name of the measurement to store data in `tiledbsoma.Experiment`. obs_id_name: Which `AnnData` `obs` column to use for append mode. var_id_name: Which `AnnData` `var` column to use for append mode. append_obsm_varm: Whether to append `obsm` and `varm` in append mode . **kwargs: Keyword arguments passed to `tiledbsoma.io.from_anndata`. Note: For S3 storage with federated credentials, credentials are updated only when the store is opened for each write step, not while a store handle is held open. Retry if credentials expire during a long write operation. 
""" try: import tiledbsoma as soma import tiledbsoma.io as soma_io except ImportError as e: raise ImportError("Please install tiledbsoma: pip install tiledbsoma") from e from lamindb.core.storage.paths import auto_storage_key_from_artifact_uid from lamindb.models import Artifact from lamindb.models._is_versioned import create_uid from lamindb.models.artifact import get_run run = get_run(run) appending = revises is not None if appending: storepath = revises.path else: uid, _ = create_uid(n_full_id=20) storage_key = auto_storage_key_from_artifact_uid( uid, ".tiledbsoma", overwrite_versions=True ) storepath = setup_settings.storage.root / storage_key if storepath.protocol == "s3": # type: ignore ctx_factory = SOMAS3ContextFactory(storepath) else: ctx_factory = None storepath_str = storepath.as_posix() add_run_uid = True run_uid_dtype = "category" if appending: ctx = None if ctx_factory is None else ctx_factory.get_context() with soma.Experiment.open(storepath_str, mode="r", context=ctx) as store: obs_schema = store["obs"].schema add_run_uid = "lamin_run_uid" in obs_schema.names # this is needed to enable backwards compatibility with tiledbsoma stores # created before PR 2300 if add_run_uid: column_type = obs_schema.types[obs_schema.names.index("lamin_run_uid")] if not isinstance(column_type, pa.DictionaryType): run_uid_dtype = None if add_run_uid and run is None: raise ValueError("Pass `run`") adata_objects = [] for adata in adatas: if isinstance(adata, AnnData): if add_run_uid and adata.is_view: raise ValueError( "Can not write an `AnnData` view, please do `adata.copy()` before passing." ) else: adata = _load_h5ad_zarr(create_path(adata)) if add_run_uid: adata.obs["lamin_run_uid"] = pd.Series( run.uid, index=adata.obs.index, dtype=run_uid_dtype ) adata_objects.append(adata) registration_mapping = kwargs.get("registration_mapping", None) if registration_mapping is None and (appending or len(adata_objects) > 1): ctx = None if ctx_factory is None else ctx_factory.get_context() registration_mapping = soma_io.register_anndatas( experiment_uri=storepath_str if appending else None, adatas=adata_objects, measurement_name=measurement_name, obs_field_name=obs_id_name, var_field_name=var_id_name, append_obsm_varm=append_obsm_varm, context=ctx, ) prepare_experiment = False resize_experiment = False if registration_mapping is not None: soma_version_parsed = version.parse(soma.__version__) if soma_version_parsed < version.parse("1.15.0rc4"): n_observations = len(registration_mapping.obs_axis.data) else: n_observations = registration_mapping.get_obs_shape() prepare_experiment = soma_version_parsed >= version.parse("1.16.2") resize_experiment = not prepare_experiment else: # happens only if not appending and only one adata passed assert len(adata_objects) == 1 # noqa: S101 n_observations = adata_objects[0].n_obs logger.important(f"writing the tiledbsoma store to {storepath_str}") experiment_exists: bool | None = None for adata_obj in adata_objects: # do not recheck if True if not experiment_exists and (resize_experiment or prepare_experiment): ctx = None if ctx_factory is None else ctx_factory.get_context() experiment_exists = soma.Experiment.exists(storepath_str, context=ctx) if experiment_exists: # both can only happen if registration_mapping is not None if resize_experiment: ctx = None if ctx_factory is None else ctx_factory.get_context() soma_io.resize_experiment( storepath_str, nobs=n_observations, nvars=registration_mapping.get_var_shapes(), context=ctx, ) resize_experiment = False elif 
prepare_experiment: ctx = None if ctx_factory is None else ctx_factory.get_context() registration_mapping.prepare_experiment(storepath_str, context=ctx) prepare_experiment = False registration_mapping_write = ( registration_mapping.subset_for_anndata(adata_obj) if hasattr(registration_mapping, "subset_for_anndata") else registration_mapping ) ctx = None if ctx_factory is None else ctx_factory.get_context() soma_io.from_anndata( storepath_str, adata_obj, measurement_name, context=ctx, obs_id_name=obs_id_name, var_id_name=var_id_name, registration_mapping=registration_mapping_write, **kwargs, ) artifact = Artifact( # type: ignore storepath, key=key, description=description, run=run, revises=revises, _is_internal_call=True, ) artifact.n_observations = n_observations artifact.otype = "tiledbsoma" return artifact.save() # this is less defensive than _anndata_n_observations # this doesn't really catches errors # assumes that the tiledbsoma object is well-formed def _soma_store_n_observations(obj) -> int: if obj.soma_type in {"SOMADataFrame", "SOMASparseNDArray", "SOMADenseNDArray"}: return obj.non_empty_domain()[0][1] + 1 elif obj.soma_type == "SOMAExperiment": return _soma_store_n_observations(obj["obs"]) elif obj.soma_type == "SOMAMeasurement": keys = obj.keys() for slot in ("X", "obsm", "obsp"): if slot in keys: return _soma_store_n_observations(next(iter(obj[slot].values()))) elif obj.soma_type == "SOMACollection": n_obs = 0 for value in obj.values(): n_obs += _soma_store_n_observations(value) return n_obs raise ValueError( "Could not infer the number of observations from the tiledbsoma object." ) def _soma_n_observations(objectpath: UPath) -> int: with _open_tiledbsoma(objectpath, mode="r") as store: return _soma_store_n_observations(store) ================================================ FILE: lamindb/core/storage/_valid_suffixes.py ================================================ from __future__ import annotations from lamindb_setup.core.upath import VALID_COMPOSITE_SUFFIXES, VALID_SIMPLE_SUFFIXES # add new composite suffixes like so VALID_COMPOSITE_SUFFIXES.update( { ".vitessce.json", ".ome.zarr", } ) # can do the same for simple valid suffixes class VALID_SUFFIXES: """Valid suffixes.""" SIMPLE: set[str] = VALID_SIMPLE_SUFFIXES """Simple suffixes.""" COMPOSITE: set[str] = VALID_COMPOSITE_SUFFIXES """Composite suffixes.""" ================================================ FILE: lamindb/core/storage/_zarr.py ================================================ from __future__ import annotations from importlib.metadata import version as get_version from typing import TYPE_CHECKING, Literal import zarr from lamin_utils import logger from lamindb_setup.core.upath import LocalPathClasses, S3FSMap, UPath, create_mapper from packaging import version from lamindb.core._compat import with_package if version.parse(get_version("anndata")) < version.parse("0.11.0"): from anndata._io import read_zarr as read_anndata_zarr else: from anndata.io import read_zarr as read_anndata_zarr if version.parse(zarr.__version__) >= version.parse("3.0.0a0"): IS_ZARR_V3 = True from zarr.abc.store import Store else: IS_ZARR_V3 = False from zarr.storage import Store # noqa if TYPE_CHECKING: from fsspec import FSMap from lamindb_setup.types import AnyPathStr from lamindb.core.storage.types import ScverseDataStructures def get_zarr_store( path: AnyPathStr, *, check: bool = False, create: bool = False ) -> str | S3FSMap | FSMap | Store: """Creates the correct object that can be used to open a zarr file depending on local or 
remote location.""" storepath, storepath_str = UPath(path), str(path) if isinstance(storepath, LocalPathClasses): store = storepath_str elif IS_ZARR_V3: # todo: also check how to treat non-asynchronous filesystems # zarr has something for this, using fsspec async wrapper # check FsspecStore code store = zarr.storage.FsspecStore.from_upath(UPath(storepath, asynchronous=True)) else: store = create_mapper(storepath.fs, storepath_str, check=check, create=create) return store def _identify_zarr_type_from_storage( storage: zarr.Group, ) -> Literal["anndata", "mudata", "spatialdata", "unknown"]: """Internal helper to identify zarr type from an open storage object.""" try: if storage.attrs.get("encoding-type", "") == "anndata": return "anndata" elif storage.attrs.get("encoding-type", "") == "MuData": return "mudata" elif "spatialdata_attrs" in storage.attrs: return "spatialdata" except Exception as error: logger.warning(f"an exception occurred {error}") return "unknown" def identify_zarr_type( storepath: AnyPathStr, *, check: bool = True ) -> Literal["anndata", "mudata", "spatialdata", "unknown"]: """Identify whether a zarr store is AnnData, SpatialData, or unknown type.""" suffixes = UPath(storepath).suffixes if ".anndata" in suffixes: return "anndata" elif ".mudata" in suffixes: return "mudata" elif ".spatialdata" in suffixes: return "spatialdata" store = get_zarr_store(storepath, check=check) try: storage = zarr.open(store, mode="r") return _identify_zarr_type_from_storage(storage) except Exception as error: logger.warning( f"an exception occured while trying to open the zarr store\n {error}" ) return "unknown" def load_zarr( storepath: AnyPathStr, expected_type: Literal["anndata", "mudata", "spatialdata"] = None, ) -> ScverseDataStructures: """Loads a zarr store and returns the corresponding scverse data structure. Args: storepath: Path to the zarr store expected_type: If provided, ensures the zarr store is of this type ("anndata", "mudata", "spatialdata") and raises ValueError if it's not """ store = get_zarr_store(storepath, check=True) # Open the storage once try: storage = zarr.open(store, mode="r") except Exception as error: raise ValueError(f"Could not open zarr store: {error}") from None actual_type = _identify_zarr_type_from_storage(storage) if expected_type is not None and actual_type != expected_type: raise ValueError( f"Expected zarr store of type '{expected_type}', but found '{actual_type}'" ) match actual_type: case "anndata": scverse_obj = read_anndata_zarr(store) case "mudata": scverse_obj = with_package("mudata", lambda mod: mod.read_zarr(store)) case "spatialdata": scverse_obj = with_package("spatialdata", lambda mod: mod.read_zarr(store)) case "unknown" | _: raise ValueError( "Unable to determine zarr store format and therefore cannot load Artifact." 
) return scverse_obj ================================================ FILE: lamindb/core/storage/objects.py ================================================ from __future__ import annotations from pathlib import Path from typing import TYPE_CHECKING, Any, TypeAlias from lamindb.core._compat import ( with_package_obj, ) if TYPE_CHECKING: from pandas import DataFrame from .types import ScverseDataStructures SupportedDataTypes: TypeAlias = DataFrame | ScverseDataStructures else: SupportedDataTypes: TypeAlias = Any def infer_suffix( dmem: SupportedDataTypes, format: str | dict[str, Any] | None = None ) -> str: """Infer LaminDB storage file suffix from a data object.""" has_anndata, anndata_suffix = with_package_obj( dmem, "AnnData", "anndata", lambda obj: _infer_anndata_suffix(format), ) if has_anndata: return anndata_suffix has_dataframe, dataframe_suffix = with_package_obj( dmem, "DataFrame", "pandas", lambda obj: _infer_dataframe_suffix(format), ) if has_dataframe: return dataframe_suffix if with_package_obj( dmem, "MuData", "mudata", lambda obj: True, # Just checking type, not calling any method )[0]: return ".h5mu" has_spatialdata, spatialdata_suffix = with_package_obj( dmem, "SpatialData", "spatialdata", lambda obj: _infer_spatialdata_suffix(format), ) if has_spatialdata: return spatialdata_suffix else: raise NotImplementedError def _infer_anndata_suffix(format: str | dict[str, Any] | None) -> str: assert not isinstance(format, dict) # noqa: S101 if format is not None: # should be `.h5ad`, `.zarr`, or `.anndata.zarr` if format not in {"h5ad", "zarr", "anndata.zarr"}: raise ValueError( "Error when specifying AnnData storage format, it should be" f" 'h5ad', 'zarr', or 'anndata.zarr', not '{format}'. Check 'format'" " or the suffix of 'key'." ) return "." + format return ".h5ad" def _infer_dataframe_suffix(format: str | dict[str, Any] | None) -> str: if isinstance(format, str): if format == ".csv": return ".csv" elif isinstance(format, dict): if format.get("suffix") == ".csv": return ".csv" return ".parquet" def _infer_spatialdata_suffix(format: str | dict[str, Any] | None) -> str: if format is None: return ".zarr" if isinstance(format, str) and format in {"spatialdata.zarr", "zarr"}: return format raise ValueError( "Error when specifying SpatialData storage format, it should be" f" 'zarr' or 'spatialdata.zarr', not '{format}'. Check 'format'" " or the suffix of 'key'."
) # for types below note that local UPaths are subclasses of Path # Path(UPath(...)) properly coerces local UPaths and throws an error for cloud UPaths def write_to_disk(dmem: SupportedDataTypes, filepath: Path | str, **kwargs) -> None: """Writes the passed in memory data to disk to a specified path.""" if with_package_obj( dmem, "AnnData", "anndata", lambda obj: _write_anndata(obj, filepath, **kwargs), )[0]: return if with_package_obj( dmem, "DataFrame", "pandas", lambda obj: _write_dataframe(obj, filepath, **kwargs), )[0]: return if with_package_obj(dmem, "MuData", "mudata", lambda obj: obj.write(filepath))[0]: return if with_package_obj( dmem, "SpatialData", "spatialdata", lambda obj: obj.write(filepath, overwrite=True), )[0]: return raise NotImplementedError def _write_anndata(dmem: Any, filepath: Path | str, **kwargs) -> None: suffix = Path(filepath).suffix if suffix == ".h5ad": dmem.write_h5ad(filepath, **kwargs) return elif suffix == ".zarr": dmem.write_zarr(filepath, **kwargs) return else: raise NotImplementedError def _write_dataframe(dmem: Any, filepath: Path | str, **kwargs) -> None: suffix = Path(filepath).suffix if suffix == ".csv": dmem.to_csv(filepath, **kwargs) return dmem.to_parquet(filepath, **kwargs) ================================================ FILE: lamindb/core/storage/paths.py ================================================ from __future__ import annotations import shutil from typing import TYPE_CHECKING import fsspec from lamindb_setup.core import StorageSettings from lamindb_setup.core.upath import ( LocalPathClasses, UPath, ) from lamindb.core._settings import settings if TYPE_CHECKING: from lamindb_setup.types import AnyPath, AnyPathStr from lamindb.models.artifact import Artifact AUTO_KEY_PREFIX = ".lamindb/" # add type annotations back asap when re-organizing the module def auto_storage_key_from_artifact(artifact: Artifact): if (real_key := artifact._real_key) is not None: return real_key key = artifact.key if key is None or artifact._key_is_virtual: return auto_storage_key_from_artifact_uid( artifact.uid, artifact.suffix, artifact.overwrite_versions ) return artifact.key def auto_storage_key_from_artifact_uid( uid: str, suffix: str, overwrite_versions: bool ) -> str: assert isinstance(suffix, str) # noqa: S101 Suffix cannot be None. 
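    # illustrative example (uid is hypothetical): uid="aB3dE5fG7hI9kL1mN3oP", suffix=".parquet",
    # overwrite_versions=True -> ".lamindb/aB3dE5fG7hI9kL1m.parquet" (uid truncated to 16 chars so all versions share one key)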
if overwrite_versions: uid_storage = uid[:16] # 16 chars, leave 4 chars for versioning else: uid_storage = uid storage_key = f"{AUTO_KEY_PREFIX}{uid_storage}{suffix}" return storage_key def check_path_is_child_of_root(path: AnyPathStr, root: AnyPathStr) -> bool: if fsspec.utils.get_protocol(str(path)) != fsspec.utils.get_protocol(str(root)): return False path_upath = UPath(path) root_upath = UPath(root) if path_upath.protocol == "s3": endpoint_path = path_upath.storage_options.get("endpoint_url", "") endpoint_root = root_upath.storage_options.get("endpoint_url", "") if endpoint_path != endpoint_root: return False # we don't resolve http links because they can resolve into a different domain # for example into a temporary url if path_upath.protocol not in {"http", "https"}: path_upath = path_upath.resolve() root_upath = root_upath.resolve() # str is needed to eliminate UPath storage_options # which affect equality checks return UPath(str(root_upath)) in UPath(str(path_upath)).parents # returns filepath and root of the storage def attempt_accessing_path( artifact: Artifact, storage_key: str, using_key: str | None = None, access_token: str | None = None, ) -> tuple[UPath, StorageSettings]: # check whether the file is in the default db and whether storage # matches default storage from lamindb.models import Storage if ( artifact._state.db in ("default", None) and artifact.storage_id == settings._storage_settings._id ): if access_token is None: storage_settings = settings._storage_settings else: storage_settings = StorageSettings( settings.storage.root, access_token=access_token ) else: if artifact._state.db not in ("default", None) and using_key is None: storage = Storage.connect(artifact._state.db).get(id=artifact.storage_id) else: storage = Storage.objects.using(using_key).get(id=artifact.storage_id) # find a better way than passing None to instance_settings in the future! 
storage_settings = StorageSettings(storage.root, access_token=access_token) path = storage_settings.key_to_filepath(storage_key) return path, storage_settings def filepath_from_artifact( artifact: Artifact, using_key: str | None = None ) -> tuple[UPath, StorageSettings | None]: if (local_filepath := getattr(artifact, "_local_filepath", None)) is not None: return local_filepath.resolve(), None storage_key = auto_storage_key_from_artifact(artifact) path, storage_settings = attempt_accessing_path( artifact, storage_key, using_key=using_key ) return path, storage_settings # virtual key is taken into consideration # only if the version is latest def _cache_key_from_artifact_storage( artifact: Artifact, storage_settings: StorageSettings | None ): cache_key = None if ( artifact._key_is_virtual and artifact.key is not None and storage_settings is not None and artifact.is_latest ): root = storage_settings.root cache_key = (root / artifact.key).path # .path does not strip protocol for http # have to do it manually if root.protocol in {"http", "https"}: cache_key = cache_key.split("://", 1)[-1] return cache_key # return filepath and cache_key if needed def filepath_cache_key_from_artifact( artifact: Artifact, using_key: str | None = None ) -> tuple[UPath, str | None]: filepath, storage_settings = filepath_from_artifact(artifact, using_key) if isinstance(filepath, LocalPathClasses): return filepath, None cache_key = _cache_key_from_artifact_storage(artifact, storage_settings) return filepath, cache_key def store_file_or_folder( local_path: AnyPathStr, storage_path: UPath, print_progress: bool = True, **kwargs ) -> None: """Store file or folder (localpath) at storagepath.""" local_path = UPath(local_path) if not isinstance(storage_path, LocalPathClasses): # this uploads files and directories if local_path.is_dir(): create_folder = False try: # if storage_path already exists we need to delete it # if local_path is a directory # to replace storage_path correctly if storage_path.stat().as_info()["type"] == "directory": storage_path.rmdir() else: storage_path.unlink() except (FileNotFoundError, PermissionError): pass else: create_folder = None storage_path.upload_from( local_path, create_folder=create_folder, print_progress=print_progress, **kwargs, ) else: # storage path is local if local_path.resolve().as_posix() == storage_path.resolve().as_posix(): return None storage_path.parent.mkdir(parents=True, exist_ok=True) if local_path.is_file(): shutil.copyfile(local_path, storage_path) else: if storage_path.exists(): shutil.rmtree(storage_path) shutil.copytree(local_path, storage_path) def delete_storage_using_key( artifact: Artifact, storage_key: str, raise_file_not_found_error: bool = True, using_key: str | None = None, ) -> None | str: filepath, _ = attempt_accessing_path(artifact, storage_key, using_key=using_key) return delete_storage( filepath, raise_file_not_found_error=raise_file_not_found_error ) def delete_storage( storagepath: AnyPath, raise_file_not_found_error: bool = True ) -> None | str: """Delete arbitrary artifact.""" if storagepath.is_file(): storagepath.unlink() elif storagepath.is_dir(): if isinstance(storagepath, LocalPathClasses): shutil.rmtree(storagepath) else: storagepath.rmdir() elif raise_file_not_found_error: raise FileNotFoundError(f"{storagepath} is not an existing path!") else: return "did-not-delete" return None ================================================ FILE: lamindb/core/storage/types.py ================================================ """Storage-related type 
definitions.""" from __future__ import annotations from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from anndata import AnnData from mudata import MuData from spatialdata import SpatialData ScverseDataStructures = AnnData | MuData | SpatialData else: # AnnData | MuData | SpatialData; Any required for union with DataFrame in objects.py ScverseDataStructures = Any ================================================ FILE: lamindb/core/subsettings/__init__.py ================================================ """Sub settings. .. autoclass:: CreationSettings .. autoclass:: AnnotationSettings """ from ._annotation_settings import AnnotationSettings from ._creation_settings import CreationSettings ================================================ FILE: lamindb/core/subsettings/_annotation_settings.py ================================================ class AnnotationSettings: n_max_records: int = 1000 """Maximal number of records to annotate with during automated annotation. If the number of records to annotate exceeds this limit, print a warning and do not annotate. The number is calculated per feature for labels, and per schema for features. """ annotation_settings = AnnotationSettings() ================================================ FILE: lamindb/core/subsettings/_creation_settings.py ================================================ class CreationSettings: search_names: bool = True """Switch off to speed up creating records (default `True`). If `True`, search for alternative names and avoids duplicates. FAQ: :doc:`/faq/idempotency` """ artifact_skip_size_hash: bool = False """To speed up registering high numbers of files (default `False`). This bypasses queries for size and hash to AWS & GCP. It speeds up file creation by about a factor 100. """ artifact_silence_missing_run_warning: bool = False """Silence warning about missing run & transform during artifact creation (default `False`).""" _artifact_use_virtual_keys: bool = True """Treat `key` parameter in :class:`~lamindb.Artifact` as virtual. If `True`, the `key` is **not** used to construct file paths, but file paths are based on the `uid` of artifact. """ creation_settings = CreationSettings() ================================================ FILE: lamindb/curators/__init__.py ================================================ """Curators. High-level curators ------------------- .. autoclass:: DataFrameCurator .. autoclass:: AnnDataCurator .. autoclass:: MuDataCurator .. autoclass:: SpatialDataCurator .. autoclass:: TiledbsomaExperimentCurator Low-level module ---------------- .. autosummary:: :toctree: . core """ from typing import TYPE_CHECKING if TYPE_CHECKING: from .core import ( AnnDataCurator, DataFrameCurator, MuDataCurator, SpatialDataCurator, TiledbsomaExperimentCurator, ) __all__ = [ "AnnDataCurator", "DataFrameCurator", "MuDataCurator", "SpatialDataCurator", "TiledbsomaExperimentCurator", ] _CURATOR_NAMES = frozenset(__all__) def __getattr__(name: str): """Lazy-import curators from core to avoid loading pandas/pandera at import.""" if name in _CURATOR_NAMES: from . import core attr = getattr(core, name) globals()[name] = attr return attr raise AttributeError(f"module {__name__!r} has no attribute {name!r}") ================================================ FILE: lamindb/curators/core.py ================================================ """Curator utilities. .. autoclass:: Curator .. autoclass:: SlotsCurator .. autoclass:: ComponentCurator .. autoclass:: CatVector .. autoclass:: CatLookup .. 
autoclass:: DataFrameCatManager """ from __future__ import annotations import copy import re from typing import TYPE_CHECKING, Any, Callable import lamindb_setup as ln_setup import numpy as np import pandas as pd import pandera.pandas as pandera from django.db.models import Q from lamin_utils import colors, logger from lamindb_setup.core._docs import doc_args from lamindb_setup.core.upath import LocalPathClasses from lamindb.base.dtypes import check_dtype from lamindb.base.types import FieldAttr # noqa from lamindb.models import ( Artifact, Feature, Run, Schema, SQLRecord, ) from lamindb.models._from_values import _format_values, _from_values from lamindb.models.artifact import ( data_is_scversedatastructure, data_is_soma_experiment, ) from lamindb.models.feature import ( parse_cat_dtype, parse_dtype, parse_filter_string, resolve_relation_filters, ) from lamindb.models.query_set import BasicQuerySet, SQLRecordList from lamindb.models.sqlrecord import HasType from ..errors import InvalidArgument, ValidationError from ..models._from_values import get_organism_record_from_field from ..models.feature import get_record_type_from_uid if TYPE_CHECKING: from collections.abc import Iterable from typing import Any from anndata import AnnData from mudata import MuData from spatialdata import SpatialData from tiledbsoma._experiment import Experiment as SOMAExperiment from lamindb.core.storage.types import ScverseDataStructures def strip_ansi_codes(text): # This pattern matches ANSI escape sequences ansi_pattern = re.compile(r"\x1b\[[0-9;]*m") return ansi_pattern.sub("", text) class CatLookup: """Lookup categories from the reference instance. Args: categoricals: A dictionary of categorical fields to lookup. slots: A dictionary of slot fields to lookup. public: Whether to lookup from the public instance. Defaults to False. Example:: curator = ln.curators.DataFrameCurator(...) 
curator.cat.lookup()["cell_type"].alveolar_type_1_fibroblast_cell """ def __init__( self, categoricals: list[Feature] | dict[str, FieldAttr], slots: dict[str, FieldAttr] = None, public: bool = False, sources: dict[str, SQLRecord] | None = None, ) -> None: slots = slots or {} if isinstance(categoricals, list): categoricals = { feature.name: parse_dtype(feature._dtype_str)[0]["field"] for feature in categoricals } self._categoricals = {**categoricals, **slots} self._public = public self._sources = sources def __getattr__(self, name): if name in self._categoricals: registry = self._categoricals[name].field.model if self._public and hasattr(registry, "public"): return registry.public(source=self._sources.get(name)).lookup() else: return registry.lookup() raise AttributeError( f'"{self.__class__.__name__}" object has no attribute "{name}"' ) def __getitem__(self, name): if name in self._categoricals: registry = self._categoricals[name].field.model if self._public and hasattr(registry, "public"): return registry.public(source=self._sources.get(name)).lookup() else: return registry.lookup() raise AttributeError( f'"{self.__class__.__name__}" object has no attribute "{name}"' ) def __repr__(self) -> str: if len(self._categoricals) > 0: getattr_keys = "\n ".join( [f".{key}" for key in self._categoricals if key.isidentifier()] ) getitem_keys = "\n ".join( [str([key]) for key in self._categoricals if not key.isidentifier()] ) ref = "public" if self._public else "registries" return ( f"Lookup objects from the {colors.italic(ref)}:\n " f"{colors.green(getattr_keys)}\n " f"{colors.green(getitem_keys)}\n" 'Example:\n → categories = curator.lookup()["cell_type"]\n' " → categories.alveolar_type_1_fibroblast_cell\n\n" "To look up public ontologies, use .lookup(public=True)" ) else: # pragma: no cover return colors.warning("No fields are found!") CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries.""" SLOTS_DOCSTRING = """Access sub curators by slot.""" SLOTS_DETAILS_DOCSTRING = """Uses **slots** to specify which component contains which schema. Slots are keys that identify where features are stored within composite data structures.""" VALIDATE_DOCSTRING = """Validate dataset against Schema. Raises: lamindb.errors.ValidationError: If validation fails. """ SAVE_ARTIFACT_DOCSTRING = """Save an annotated artifact. Args: key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family. description: A description. revises: Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version. run: The run that creates the artifact. Returns: A saved artifact record. """ LAMINDB_COLUMN_PREFIX_REGEX = r"^__lamindb_.*$" class Curator: """Curator base class. A `Curator` object makes it easy to validate, standardize & annotate datasets. See: - :class:`~lamindb.curators.DataFrameCurator` - :class:`~lamindb.curators.AnnDataCurator` - :class:`~lamindb.curators.MuDataCurator` - :class:`~lamindb.curators.SpatialDataCurator` - :class:`~lamindb.curators.TiledbsomaExperimentCurator` """ def __init__( self, dataset: Any, schema: Schema, *, features: dict[str, Any] | None = None, require_saved_schema: bool = True, ) -> None: if not isinstance(schema, Schema): raise InvalidArgument("schema argument must be a Schema record.") if require_saved_schema and schema.pk is None: raise ValueError( "Schema must be saved before curation. Please save it using '.save()'." 
) self._artifact: Artifact | None = None self._dataset: Any = None # self._dataset is set below, it is opened or loaded if dataset is an Artifact if isinstance(dataset, Artifact): self._artifact = dataset if self._artifact.otype in { "DataFrame", "AnnData", "MuData", "SpatialData", }: if ( not isinstance(self._artifact.path, LocalPathClasses) and self._artifact.otype == "AnnData" ): try: self._dataset = self._artifact.open(mode="r") logger.important( "opened remote artifact for streaming during validation" ) except Exception as e: logger.warning( f"unable to open remote AnnData Artifact: {e}, falling back to loading into memory" ) if self._dataset is None: logger.important("loading artifact into memory for validation") self._dataset = self._artifact.load(is_run_input=False) else: raise InvalidArgument( f"Cannot load or open artifact of this type: {self._artifact}" ) else: self._dataset = dataset self._schema: Schema = schema self._external_features: dict[str, Any] = features self._is_validated: bool = False @doc_args(VALIDATE_DOCSTRING) def validate(self) -> bool | str: """{}""" # noqa: D415 pass # pragma: no cover @doc_args(SAVE_ARTIFACT_DOCSTRING) def save_artifact( self, *, key: str | None = None, description: str | None = None, revises: Artifact | None = None, run: Run | None = None, ) -> Artifact: """{}""" # noqa: D415 # Note that this docstring has to be consistent with the Artifact() # constructor signature pass # pragma: no cover def __repr__(self) -> str: from lamin_utils import colors if self._schema is not None: # Schema might have different attributes if hasattr(self._schema, "name") and self._schema.name: schema_str = colors.italic(self._schema.name) elif hasattr(self._schema, "uid"): schema_str = colors.italic(f"uid={self._schema.uid}") elif hasattr(self._schema, "id"): schema_str = colors.italic(f"id={self._schema.id}") else: schema_str = colors.italic("unnamed") # Add schema type info if available if hasattr(self._schema, "otype") and self._schema.otype: schema_str += f" ({self._schema.otype})" else: schema_str = colors.warning("None") status_str = "" if self._is_validated: status_str = f", {colors.green('validated')}" else: status_str = f", {colors.yellow('unvalidated')}" cls_name = colors.green(self.__class__.__name__) # Get additional info based on curator type extra_info = "" if hasattr(self, "_slots") and self._slots: # For SlotsCurator and its subclasses slots_count = len(self._slots) if slots_count > 0: slot_names = list(self._slots.keys()) if len(slot_names) <= 3: extra_info = f", slots: {slot_names}" else: extra_info = f", slots: [{', '.join(slot_names[:3])}... +{len(slot_names) - 3} more]" elif ( cls_name == "DataFrameCurator" and hasattr(self, "cat") and hasattr(self.cat, "_categoricals") ): # For DataFrameCurator cat_count = len(getattr(self.cat, "_categoricals", [])) if cat_count > 0: extra_info = f", categorical_features={cat_count}" artifact_info = "" if self._artifact is not None: artifact_info = f", artifact: {colors.italic(self._artifact.uid)}" return ( f"{cls_name}{artifact_info}(Schema: {schema_str}{extra_info}{status_str})" ) @doc_args(SLOTS_DETAILS_DOCSTRING) class SlotsCurator(Curator): """Curator for a dataset with slots. {} Args: dataset: The dataset to validate & annotate. schema: A :class:`~lamindb.Schema` object that defines the validation constraints. 
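    Example: a hedged sketch of composing slot schemas and accessing sub-curators; assumes
    ``obs_schema`` and ``var_schema`` are saved :class:`~lamindb.Schema` records, ``adata`` is an
    ``AnnData`` object, and that ``Schema`` accepts a ``slots`` argument as in the scripts
    referenced by :class:`~lamindb.curators.AnnDataCurator`::

        import lamindb as ln

        anndata_schema = ln.Schema(
            name="small anndata schema",
            otype="AnnData",
            slots={"obs": obs_schema, "var.T": var_schema},
        ).save()
        curator = ln.curators.AnnDataCurator(adata, anndata_schema)
        curator.slots["obs"].validate()  # validate a single slot
        curator.validate()               # validate all slots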
""" def __init__( self, dataset: Artifact | ScverseDataStructures | SOMAExperiment, schema: Schema, *, features: dict[str, Any] | None = None, require_saved_schema: bool = True, ) -> None: super().__init__( dataset=dataset, schema=schema, features=features, require_saved_schema=require_saved_schema, ) self._slots: dict[str, ComponentCurator] = {} # used for multimodal data structures (not AnnData) # in form of {table/modality_key: var_field} self._var_fields: dict[str, FieldAttr] = {} # in form of {table/modality_key: categoricals} self._cat_vectors: dict[str, dict[str, CatVector]] = {} @property @doc_args(SLOTS_DOCSTRING) def slots(self) -> dict[str, ComponentCurator]: """{}""" # noqa: D415 return self._slots @doc_args(VALIDATE_DOCSTRING) def validate(self) -> None: """{}""" # noqa: D415 if "__external__" in self._schema.slots: validation_schema = self._schema.slots["__external__"] if not self._external_features: if self._artifact is not None and not self._artifact._state.adding: logger.important( "no new external features provided, using existing external features of artifact for validation" ) self._external_features = self._artifact.features.get_values( external_only=True ) else: raise ValidationError( "External features slot is defined in schema but no external features were provided." ) ExperimentalDictCurator( self._external_features, validation_schema ).validate() for slot, curator in self._slots.items(): logger.debug(f"validating slot {slot} ...") curator.validate() # set _is_validated to True as no slot raised an error self._is_validated = True @doc_args(SAVE_ARTIFACT_DOCSTRING) def save_artifact( self, *, key: str | None = None, description: str | None = None, revises: Artifact | None = None, run: Run | None = None, ) -> Artifact: """{}""" # noqa: D415 if not self._is_validated: self.validate() if self._artifact is None: type_mapping = [ ( lambda dataset: isinstance(dataset, pd.DataFrame), Artifact.from_dataframe, ), ( lambda dataset: data_is_scversedatastructure(dataset, "AnnData"), Artifact.from_anndata, ), ( lambda dataset: data_is_scversedatastructure(dataset, "MuData"), Artifact.from_mudata, ), ( lambda dataset: data_is_scversedatastructure( dataset, "SpatialData" ), Artifact.from_spatialdata, ), (data_is_soma_experiment, Artifact.from_tiledbsoma), ] for type_check, af_constructor in type_mapping: if type_check(self._dataset): self._artifact = af_constructor( # type: ignore self._dataset, key=key, description=description, revises=revises, run=run, ) break cat_vectors = {} for curator in self._slots.values(): for key, cat_vector in curator.cat._cat_vectors.items(): cat_vectors[key] = cat_vector self._artifact.schema = self._schema if self._external_features: self._artifact._external_features = self._external_features self._artifact.save() return annotate_artifact( # type: ignore self._artifact, curator=self, cat_vectors=cat_vectors, ) def convert_dict_to_dataframe_for_validation(d: dict, schema: Schema) -> pd.DataFrame: """Convert a dictionary to a DataFrame for validation against a schema.""" df = pd.DataFrame([d]) for feature in schema.members: # we cannot cast a `list[cat[...]]]` to categorical because lists are not hashable if feature.dtype_as_str.startswith("cat"): if feature.name in df.columns: value = df.loc[0, feature.name] if isinstance(value, (list, SQLRecordList, set, BasicQuerySet)): df.attrs[feature.name] = "list_of_categories" else: if isinstance(value, SQLRecord) and value._state.adding: raise ValidationError( f"{value.__class__.__name__} {getattr(value, 
getattr(value, 'name_field', 'name'), value.uid)} is not saved." ) df[feature.name] = pd.Categorical(df[feature.name]) return df # For more context, read https://laminlabs.slack.com/archives/C07DB677JF6/p1753994077716099 and # https://www.notion.so/laminlabs/Add-a-DictCurator-2422aeaa55e180b9a513f91d13970836 class ComponentCurator(Curator): """Curator for `DataFrame`. Provides all key functionality to validate Pandas DataFrames. This class is not user facing unlike :class:`~lamindb.curators.DataFrameCurator` which extends this class with functionality to validate the `attrs` slot. Args: dataset: The DataFrame-like object to validate & annotate. schema: A :class:`~lamindb.Schema` object that defines the validation constraints. slot: Indicate the slot in a composite curator for a composite data structure. """ def __init__( self, dataset: pd.DataFrame | Artifact, schema: Schema, slot: str | None = None, require_saved_schema: bool = True, ) -> None: super().__init__( dataset=dataset, schema=schema, require_saved_schema=require_saved_schema ) categoricals = [] features = [] feature_ids: set[int] = set() if schema.flexible: features += Feature.filter(name__in=self._dataset.keys()).to_list() feature_ids = {feature.id for feature in features} if schema.n_members and schema.n_members > 0: if schema._index_feature_uid is not None: schema_features = [ feature for feature in schema.members.to_list() if feature.uid != schema._index_feature_uid # type: ignore ] else: schema_features = schema.members.to_list() # type: ignore if feature_ids: features.extend( feature for feature in schema_features if feature.id not in feature_ids # type: ignore ) else: features.extend(schema_features) else: assert schema.itype is not None # noqa: S101 pandera_columns = {} self._pandera_schema = None if features or schema._index_feature_uid is not None: # populate features if schema.minimal_set: optional_feature_uids = set(schema.optionals.get_uids()) for feature in features: if schema.minimal_set: required = feature.uid not in optional_feature_uids else: required = False # series.dtype is "object" if the column has lists types, e.g. 
[["a", "b"], ["a"], ["b"]] dtype_str = feature._dtype_str if ( dtype_str.startswith("list[cat") or self._dataset.attrs.get(feature.name) == "list_of_categories" ): pandera_columns[feature.name] = pandera.Column( dtype=None, checks=pandera.Check( check_dtype("list", feature.nullable), element_wise=False, error=f"Column '{feature.name}' failed dtype check for '{dtype_str}' against (list, nullable={feature.nullable})", ), nullable=feature.nullable, coerce=feature.coerce, required=required, ) elif dtype_str in { "int", "float", "bool", "num", "path", "url", } or dtype_str.startswith("list"): if isinstance(self._dataset, pd.DataFrame): dtype = ( self._dataset[feature.name].dtype if feature.name in self._dataset.keys() else None ) else: dtype = None pandera_columns[feature.name] = pandera.Column( dtype=None, checks=pandera.Check( check_dtype(dtype_str, feature.nullable), element_wise=False, error=f"Column '{feature.name}' failed dtype check for '{dtype_str}': got {dtype}", ), nullable=feature.nullable, coerce=feature.coerce, required=required, ) elif dtype_str == "dict": pandera_columns[feature.name] = pandera.Column( dtype=object, nullable=feature.nullable, coerce=feature.coerce, required=required, checks=pandera.Check( lambda s: s.dropna() .apply(lambda x: isinstance(x, dict)) .all(), error="Non-null values must be dicts", ), ) else: pandera_dtype = ( dtype_str if not dtype_str.startswith("cat") else "category" ) pandera_columns[feature.name] = pandera.Column( pandera_dtype, nullable=feature.nullable, coerce=feature.coerce, required=required, ) if dtype_str.startswith("cat") or dtype_str.startswith("list[cat["): # validate categoricals if the column is required or if the column is present # but exclude the index feature from column categoricals if (required or feature.name in self._dataset.keys()) and ( schema._index_feature_uid is None or feature.uid != schema._index_feature_uid ): categoricals.append(feature) # in almost no case, an index should have a pandas.CategoricalDtype in a DataFrame # so, we're typing it as `str` here if schema.index is not None: index = pandera.Index( schema.index._dtype_str if not schema.index._dtype_str.startswith("cat") else str ) else: index = None if schema.maximal_set: # allow any columns starting with "__lamindb" even if maximal_set is True pandera_columns[LAMINDB_COLUMN_PREFIX_REGEX] = pandera.Column( regex=True, required=False, nullable=True ) self._pandera_schema = pandera.DataFrameSchema( pandera_columns, coerce=schema.coerce, strict=schema.maximal_set, ordered=schema.ordered_set, index=index, ) if ( schema.itype == "Composite" ): # backward compat, should be migrated to Feature.name columns_field = Feature.name else: columns_field = parse_cat_dtype(schema.itype, is_itype=True)["field"] # in the DataFrameCatManager, we use the # actual columns of the dataset, not the pandera columns # the pandera columns might have additional optional columns self._cat_manager = DataFrameCatManager( self._dataset, columns_field=columns_field, categoricals=categoricals, index=schema.index, slot=slot, maximal_set=schema.maximal_set, schema=schema, ) @property @doc_args(CAT_MANAGER_DOCSTRING) def cat(self) -> DataFrameCatManager: """{}""" # noqa: D415 return self._cat_manager def standardize(self) -> None: """Standardize the dataset. - Adds missing columns for features - Fills missing values for features with default values """ if self._artifact is not None: raise RuntimeError( "Cannot mutate the dataset when an artifact is passed! 
Please load the dataset into memory using `dataset.load()` and pass it to a curator." ) for feature in self._schema.members: if feature.name not in self._dataset.columns: if feature.default_value is not None or feature.nullable: fill_value = ( feature.default_value if feature.default_value is not None else pd.NA ) dtype_str = feature._dtype_str if dtype_str.startswith("cat"): self._dataset[feature.name] = pd.Categorical( [fill_value] * len(self._dataset) ) else: self._dataset[feature.name] = fill_value logger.important( f"added column {feature.name} with fill value {fill_value}" ) else: raise ValidationError( f"Missing column {feature.name} cannot be added because is not nullable and has no default value" ) else: if feature.default_value is not None: if isinstance( self._dataset[feature.name].dtype, pd.CategoricalDtype ): if ( feature.default_value not in self._dataset[feature.name].cat.categories ): self._dataset[feature.name] = self._dataset[ feature.name ].cat.add_categories(feature.default_value) self._dataset[feature.name] = self._dataset[feature.name].fillna( feature.default_value ) def _cat_manager_validate(self) -> None: self.cat.validate() if self.cat._is_validated: self._is_validated = True else: self._is_validated = False raise ValidationError(self.cat._validate_category_error_messages) @doc_args(VALIDATE_DOCSTRING) def validate(self) -> None: """{}""" # noqa: D415 if self._pandera_schema is not None: try: # first validate through pandera self._pandera_schema.validate(self._dataset, lazy=True) # then validate lamindb categoricals self._cat_manager_validate() except (pandera.errors.SchemaError, pandera.errors.SchemaErrors) as err: self._is_validated = False has_dtype_error = "WRONG_DATATYPE" in str(err) error_msg = str(err) if has_dtype_error: error_msg += " ▶ Hint: Consider setting `feature.coerce = True` to attempt coercing values during validation to the required dtype." raise ValidationError(error_msg) from err else: self._cat_manager_validate() class DataFrameCurator(SlotsCurator): # the example in the docstring is tested in test_curators_quickstart_example """Curator for `DataFrame`. Args: dataset: The DataFrame-like object to validate & annotate. schema: A :class:`~lamindb.Schema` object that defines the validation constraints. slot: Indicate the slot in a composite curator for a composite data structure. require_saved_schema: Whether the schema must be saved before curation. Examples: For a simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_dataframe`. Here is an example that enforces a minimal set of columns in the dataframe. .. literalinclude:: scripts/curate_dataframe_minimal_errors.py :language: python Under-the-hood, this used the following schema. .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py :language: python Valid features & labels were defined as: .. literalinclude:: scripts/define_mini_immuno_features_labels.py :language: python It is also possible to curate the `attrs` slot. .. 
literalinclude:: scripts/curate_dataframe_attrs.py :language: python """ def __init__( self, dataset: pd.DataFrame | Artifact, schema: Schema, *, slot: str | None = None, features: dict[str, Any] | None = None, require_saved_schema: bool = True, ) -> None: # loads or opens dataset, dataset may be an artifact super().__init__( dataset=dataset, schema=schema, features=features, require_saved_schema=require_saved_schema, ) # uses open dataset at self._dataset self._atomic_curator = ComponentCurator( dataset=self._dataset, schema=schema, slot=slot, require_saved_schema=require_saved_schema, ) # Handle (nested) attrs if slot is None and schema.slots: for slot_name, slot_schema in schema.slots.items(): if slot_name.startswith("attrs"): path_parts = slot_name.split(":") attrs_dict = getattr(self._dataset, "attrs", None) if attrs_dict is not None: if len(path_parts) == 1: data = attrs_dict else: deeper_keys = path_parts[1:] data = _resolve_schema_slot_path( attrs_dict, deeper_keys, slot_name, "attrs" ) df = convert_dict_to_dataframe_for_validation(data, slot_schema) self._slots[slot_name] = ComponentCurator( df, slot_schema, slot=slot_name, require_saved_schema=require_saved_schema, ) elif slot_name != "__external__": raise ValueError( f"Slot '{slot_name}' is not supported for DataFrameCurator. Must be 'attrs'." ) @property def cat(self) -> DataFrameCatManager: """Manage categoricals by updating registries.""" return self._atomic_curator.cat def standardize(self) -> None: """Standardize the dataset. - Adds missing columns for features - Fills missing values for features with default values """ self._atomic_curator.standardize() for slot_curator in self._slots.values(): slot_curator.standardize() @doc_args(VALIDATE_DOCSTRING) def validate(self) -> None: """{}.""" self._atomic_curator.validate() self._is_validated = self._atomic_curator._is_validated super().validate() @doc_args(SAVE_ARTIFACT_DOCSTRING) def save_artifact( self, *, key=None, description=None, revises=None, run=None ) -> Artifact: """{}.""" if not self._is_validated: self.validate() self._slots["columns"] = self._atomic_curator try: return super().save_artifact( key=key, description=description, revises=revises, run=run ) finally: del self._slots["columns"] class ExperimentalDictCurator(DataFrameCurator): """Curator for `dict` based on `DataFrameCurator`.""" def __init__( self, dataset: dict | Artifact, schema: Schema, slot: str | None = None, require_saved_schema: bool = False, ) -> None: if not isinstance(dataset, dict) and not isinstance(dataset, Artifact): raise InvalidArgument("The dataset must be a dict or dict-like artifact.") if isinstance(dataset, Artifact): assert dataset.otype == "dict", "Artifact must be of otype 'dict'." # noqa: S101 d = dataset.load(is_run_input=False) else: d = dataset df = convert_dict_to_dataframe_for_validation(d, schema) # type: ignore super().__init__( df, schema, slot=slot, require_saved_schema=require_saved_schema ) def _resolve_schema_slot_path( target_dict: dict[str, Any], slot_keys: Iterable[str], slot: str, base_path: str ) -> Any: """Resolve a schema slot path by traversing nested dictionary keys. 
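    For example, ``slot_keys=["assay", "metadata"]`` resolves ``target_dict["assay"]["metadata"]``;
    a missing key raises ``InvalidArgument`` and lists the keys available at that level.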
Args: target_dict: Root dictionary to traverse slot_keys: Sequence of keys defining the paths to traverse slot_name: Schema slot identifier for error context base_path: Base path string for error context Returns: The value at the resolved path """ current = target_dict for key in slot_keys: base_path += f"['{key}']" try: current = current[key] except ( KeyError, TypeError, ): # if not a dict, raises TypeError; if a dict and key not found, raises KeyError available = ( list(current.keys()) if isinstance(current, dict) else "none (not a dict)" ) raise InvalidArgument( f"Schema slot '{slot}' requires keys {base_path} but key '{key}' " f"not found. Available keys at this level: {available}." ) from None return current def _handle_dict_slots( dataset: ScverseDataStructures, slot: str ) -> tuple[pd.DataFrame | None, str | None, str | None]: """Handle dict-based slot paths (uns/attrs standalone or of modalities) for all ScverseCurators. Supports two patterns: - Direct dict access: "uns", "attrs", "uns:key1:key2", "attrs:key" - Modality dict access: "modality:uns" Args: dataset: The scverse datastructure object slot: The slot path string to parse like 'uns:path:to'. Returns: tuple: (dataframe, modality_key, remaining_slot_path) - dataframe: Single-row DataFrame containing the resolved data - modality_key: Modality identifier if slot targets modality dict, else None - remaining_slot_path: The dict attribute and nested keys as string """ path_parts = slot.split(":") # Handle direct dict slots: "uns", "attrs", "uns:key1:key2:..." if len(path_parts) >= 1 and path_parts[0] in ["uns", "attrs"]: dict_attr = getattr(dataset, path_parts[0], None) if dict_attr is not None: if len(path_parts) == 1: return pd.DataFrame([dict_attr]), None, path_parts[0] deeper_keys = path_parts[1:] data = _resolve_schema_slot_path( dict_attr, deeper_keys, slot, path_parts[0] ) return pd.DataFrame([data]), None, ":".join(path_parts[1:]) # Handle modality dict slots: "modality:uns", "modality:uns:key1:key2" elif len(path_parts) >= 2 and path_parts[1] in ["uns", "attrs"]: modality, dict_name = path_parts[0], path_parts[1] try: modality_dataset = dataset[modality] dict_attr = getattr(modality_dataset, dict_name, None) if dict_attr is not None: if len(path_parts) == 2: return pd.DataFrame([dict_attr]), modality, dict_name deeper_keys = path_parts[2:] data = _resolve_schema_slot_path( dict_attr, deeper_keys, slot, f"{modality}.{dict_name}" ) return pd.DataFrame([data]), modality, ":".join(path_parts[1:]) except (KeyError, AttributeError): pass else: raise InvalidArgument( f"Invalid dict slot pattern '{slot}'. Expected formats: " f"'uns', 'attrs', 'uns:key', 'attrs:key', 'modality:uns'" ) return None, None, None @doc_args(SLOTS_DETAILS_DOCSTRING) class AnnDataCurator(SlotsCurator): """Curator for `AnnData`. {} Args: dataset: The AnnData-like object to validate & annotate. schema: A :class:`~lamindb.Schema` object that defines the validation constraints. Examples: Curate Ensembl gene IDs and valid features in obs: .. literalinclude:: scripts/curate_anndata_flexible.py :language: python :caption: curate_anndata_flexible.py Curate `uns` dictionary: .. literalinclude:: scripts/curate_anndata_uns.py :language: python :caption: curate_anndata_uns.py See Also: :meth:`~lamindb.Artifact.from_anndata`. 
""" def __init__( self, dataset: AnnData | Artifact, schema: Schema, ) -> None: super().__init__(dataset=dataset, schema=schema) if not data_is_scversedatastructure(self._dataset, "AnnData"): raise InvalidArgument("dataset must be AnnData-like.") if schema.otype != "AnnData": raise InvalidArgument("Schema otype must be 'AnnData'.") for slot, slot_schema in schema.slots.items(): if slot not in {"var", "var.T", "obs"} and not slot.startswith("uns"): raise ValueError( f"AnnDataCurator currently only supports the slots 'var', 'var.T', 'obs', and 'uns', not {slot}" ) if slot.startswith("uns"): df, _, _ = _handle_dict_slots(self._dataset, slot) elif slot in {"obs", "var", "var.T"}: df = ( getattr(self._dataset, slot.strip(".T")).T if slot == "var.T" or ( slot == "var" and schema.slots["var"].itype not in {None, "Feature"} ) else getattr(self._dataset, slot) ) self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot) # Handle var index naming for backward compat if slot == "var" and schema.slots["var"].itype not in {None, "Feature"}: logger.warning( "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}" ) self._slots["var"].cat._cat_vectors["var_index"] = self._slots[ "var" ].cat._cat_vectors.pop("columns") self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index" def _assign_var_fields_categoricals_multimodal( modality: str | None, slot_type: str, slot: str, slot_schema: Schema, var_fields: dict[str, FieldAttr], cat_vectors: dict[str, dict[str, CatVector]], slots: dict[str, ComponentCurator], ) -> None: """Assigns var_fields and categoricals for multimodal data curators.""" if modality is not None: var_fields[modality] = None cat_vectors[modality] = {} if slot_type == "var": var_field = parse_cat_dtype(slot_schema.itype, is_itype=True)["field"] if modality is None: # This should rarely/never be used since tables should have different var fields var_fields[slot] = var_field # pragma: no cover else: # Note that this is NOT nested since the nested key is always "var" var_fields[modality] = var_field else: obs_fields = slots[slot].cat._cat_vectors if modality is None: cat_vectors[slot] = obs_fields else: # Note that this is NOT nested since the nested key is always "obs" cat_vectors[modality] = obs_fields @doc_args(SLOTS_DETAILS_DOCSTRING) class MuDataCurator(SlotsCurator): """Curator for `MuData`. {} Args: dataset: The MuData-like object to validate & annotate. schema: A :class:`~lamindb.Schema` object that defines the validation constraints. Example: .. literalinclude:: scripts/curate_mudata.py :language: python :caption: curate_mudata.py See Also: :meth:`~lamindb.Artifact.from_mudata`. 
""" def __init__( self, dataset: MuData | Artifact, schema: Schema, ) -> None: super().__init__(dataset=dataset, schema=schema) if not data_is_scversedatastructure(self._dataset, "MuData"): raise InvalidArgument("dataset must be MuData-like.") if schema.otype != "MuData": raise InvalidArgument("Schema otype must be 'MuData'.") for slot, slot_schema in schema.slots.items(): # Handle slots: "mdata.uns", "modality:uns" if "uns" in slot: df, modality, modality_slot = _handle_dict_slots(self._dataset, slot) else: # Handle slots: "modality:obs", "modality:var" parts = slot.split(":") if len(parts) == 2: modality, modality_slot = parts try: schema_dataset = self._dataset[modality] df = getattr(schema_dataset, modality_slot.rstrip(".T")) except KeyError: raise InvalidArgument( f"Modality '{modality}' not found in MuData" ) from None except AttributeError: raise InvalidArgument( f"Attribute '{modality_slot}' not found on modality '{modality}'" ) from None else: # Handle slots: "mdata:obs", "mdata:var" (uns is a dictionary and gets handled above) modality, modality_slot = None, slot schema_dataset = self._dataset df = getattr(schema_dataset, modality_slot.rstrip(".T")) # Transpose var if necessary if modality_slot == "var" and schema.slots[slot].itype not in { None, "Feature", }: logger.warning( "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}" ) df = df.T elif modality_slot == "var.T": df = df.T self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot) _assign_var_fields_categoricals_multimodal( modality=modality, slot_type=modality_slot, slot=slot, slot_schema=slot_schema, var_fields=self._var_fields, cat_vectors=self._cat_vectors, slots=self._slots, ) self._columns_field = self._var_fields @doc_args(SLOTS_DETAILS_DOCSTRING) class SpatialDataCurator(SlotsCurator): """Curator for `SpatialData`. {} Args: dataset: The SpatialData-like object to validate & annotate. schema: A :class:`~lamindb.Schema` object that defines the validation constraints. Example: .. literalinclude:: scripts/curate_spatialdata.py :language: python :caption: curate_spatialdata.py See Also: :meth:`~lamindb.Artifact.from_spatialdata`. 
""" def __init__( self, dataset: SpatialData | Artifact, schema: Schema, ) -> None: super().__init__(dataset=dataset, schema=schema) if not data_is_scversedatastructure(self._dataset, "SpatialData"): raise InvalidArgument("dataset must be SpatialData-like.") if schema.otype != "SpatialData": raise InvalidArgument("Schema otype must be 'SpatialData'.") for slot, slot_schema in schema.slots.items(): # Handle slots: "sdata:attrs" if slot.startswith("attrs"): df, table_key, table_slot = _handle_dict_slots(self._dataset, slot) else: parts = slot.split(":") # Handle slots: "tables:table_key:obs", "tables:table_key:var" if len(parts) == 3 and parts[0] == "tables": table_key, table_slot = parts[1], parts[2] try: slot_object = self._dataset.tables[table_key] df = getattr(slot_object, table_slot.rstrip(".T")) except KeyError: raise InvalidArgument( f"Table '{table_key}' not found in sdata.tables" ) from None except AttributeError: raise InvalidArgument( f"Attribute '{table_slot}' not found on table '{table_key}'" ) from None else: # Handle legacy single keys for backward compatibility if len(parts) == 1 and parts[0] != "attrs": logger.warning( f"please prefix slot {slot} with 'attrs:' going forward" ) try: df = pd.DataFrame([self._dataset.attrs[slot]]) table_key = None table_slot = slot except KeyError: raise InvalidArgument( f"Slot '{slot}' not found in sdata.attrs" ) from None else: raise InvalidArgument(f"Unrecognized slot format: {slot}") # Handle var transposition logic if table_slot == "var" and schema.slots[slot].itype not in { None, "Feature", }: logger.warning( "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}" ) df = df.T elif table_slot == "var.T": df = df.T self._slots[slot] = ComponentCurator(df, slot_schema, slot) _assign_var_fields_categoricals_multimodal( modality=table_key, slot_type=table_slot, slot=slot, slot_schema=slot_schema, var_fields=self._var_fields, cat_vectors=self._cat_vectors, slots=self._slots, ) self._columns_field = self._var_fields @doc_args(SLOTS_DETAILS_DOCSTRING) class TiledbsomaExperimentCurator(SlotsCurator): """Curator for `tiledbsoma.Experiment`. {} Args: dataset: The `tiledbsoma.Experiment` object. schema: A :class:`~lamindb.Schema` object that defines the validation constraints. Example: .. literalinclude:: scripts/curate_soma_experiment.py :language: python :caption: curate_soma_experiment.py See Also: :meth:`~lamindb.Artifact.from_tiledbsoma`. 
""" def __init__( self, dataset: SOMAExperiment | Artifact, schema: Schema, ) -> None: super().__init__(dataset=dataset, schema=schema) if not data_is_soma_experiment(self._dataset): raise InvalidArgument("dataset must be SOMAExperiment-like.") if schema.otype != "tiledbsoma": raise InvalidArgument("Schema otype must be 'tiledbsoma'.") for slot, slot_schema in schema.slots.items(): if slot.startswith("ms:"): _, modality_slot = slot.split(":") schema_dataset = ( self._dataset.ms[modality_slot.removesuffix(".T")] .var.read() .concat() .to_pandas() .drop("soma_joinid", axis=1, errors="ignore") ) self._slots[slot] = ComponentCurator( (schema_dataset.T if modality_slot == "var.T" else schema_dataset), slot_schema, ) else: # global Experiment obs slot modality_slot = slot schema_dataset = ( self._dataset.obs.read() .concat() .to_pandas() .drop(["soma_joinid", "obs_id"], axis=1, errors="ignore") ) self._slots[slot] = ComponentCurator( schema_dataset, slot_schema, ) _assign_var_fields_categoricals_multimodal( modality=slot, # not passing `measurement` here because it's a constant. The slot has the actual modality slot_type=modality_slot, slot=slot, slot_schema=slot_schema, var_fields=self._var_fields, cat_vectors=self._cat_vectors, slots=self._slots, ) self._columns_field = self._var_fields class CatVector: """Vector with categorical values.""" def __init__( self, values_getter: Callable | Iterable[str], # A callable or iterable that returns the values to validate. field: FieldAttr, # The field to validate against. key: str, # The name of the vector to validate. Only used for logging. values_setter: Callable | None = None, # A callable that sets the values. source: SQLRecord | None = None, # The ontology source to validate against. feature: Feature | None = None, cat_manager: DataFrameCatManager | None = None, filter_str: str = "", record_uid: str | None = None, maximal_set: bool = True, # whether unvalidated categoricals cause validation failure. 
schema: Schema = None, ) -> None: self._values_getter = values_getter self._values_setter = values_setter self._field = field self._key = key self._source = source self._validated: None | list[str] = None self._non_validated: None | list[str] = None self._synonyms: None | dict[str, str] = None self._record_uid = record_uid self._subtype_query_set = None self._cat_manager = cat_manager self.feature = feature self.records = None self._maximal_set = maximal_set self._type_record = None self._registry = self._field.field.model self._field_name = self._field.field.name self._filter_kwargs = {} self._schema = schema if filter_str and filter_str != "unsaved": self._filter_kwargs.update( resolve_relation_filters( parse_filter_string(filter_str), self._registry ) # type: ignore ) if self._registry.__base__.__name__ == "BioRecord": if self._source is not None: self._filter_kwargs["source"] = self._source organism_record = get_organism_record_from_field( field=self._field, organism=self._filter_kwargs.get("organism"), values=self.values, ) if organism_record is not None: self._filter_kwargs["organism"] = organism_record self._filter_kwargs = get_current_filter_kwargs( self._registry, self._filter_kwargs ) # get the dtype associated record based on the record_uid if self._record_uid: self._type_record = get_record_type_from_uid( self._registry, self._record_uid, ) if hasattr(self._registry, "_name_field"): label_ref_is_name = self._field_name == self._registry._name_field else: label_ref_is_name = self._field_name == "name" self.label_ref_is_name = label_ref_is_name @property def values(self): """Get the current values using the getter function.""" if callable(self._values_getter): return self._values_getter() return self._values_getter @values.setter def values(self, new_values): """Set new values using the setter function if available.""" if callable(self._values_setter): self._values_setter(new_values) else: # If values_getter is not callable, it's a direct reference we can update self._values_getter = new_values @property def is_validated(self) -> bool: """Whether the vector is validated.""" # if nothing was validated, something likely is fundamentally wrong # should probably add a setting `at_least_one_validated` result = True if len(self.values) > 0 and len(self.values) == len(self._non_validated): logger.warning(f"no values were validated for {self._key}!") # len(self._non_validated) != 0 # if maximal_set is True, return False # if maximal_set is False, return True # len(self._non_validated) == 0 # return True if len(self._non_validated) != 0: if self._maximal_set: result = False return result def _replace_synonyms(self) -> list[str]: """Replace synonyms in the vector with standardized values.""" def process_value(value, syn_mapper): """Helper function to process values recursively.""" if isinstance(value, list): # Handle list - recursively process each item return [process_value(item, syn_mapper) for item in value] else: # Handle single value return syn_mapper.get(value, value) syn_mapper = self._synonyms # replace the values in df std_values = self.values.map( lambda unstd_val: process_value(unstd_val, syn_mapper) ) # remove the standardized values from self.non_validated non_validated = [i for i in self._non_validated if i not in syn_mapper] if len(non_validated) == 0: self._non_validated = [] else: self._non_validated = non_validated # type: ignore # logging n = len(syn_mapper) if n > 0: syn_mapper_print = _format_values( [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep="" ) s = "s" 
if n > 1 else "" logger.success( f'standardized {n} synonym{s} in "{self._key}": {colors.green(syn_mapper_print)}' ) return std_values def __repr__(self) -> str: if self._non_validated is None: status = "unvalidated" else: status = ( "validated" if len(self._non_validated) == 0 else f"non-validated ({len(self._non_validated)})" ) field_name = getattr(self._field, "name", str(self._field)) values_count = len(self.values) if hasattr(self.values, "__len__") else "?" return f"CatVector(key='{self._key}', field='{field_name}', values={values_count}, {status})" def _add_validated(self) -> tuple[list, list]: """Save features or labels records in the default instance.""" from lamindb.models.has_parents import keep_topmost_matches from lamindb.models.save import save as ln_save model_field = self._registry.__get_name_with_module__() values = [ value for value in self.values if (isinstance(value, str) and value) or ( isinstance(value, (int, float)) and not isinstance(value, bool) and value == value ) or (isinstance(value, list) and value) or ( isinstance(value, np.ndarray) and value.size > 0 and value.dtype != bool ) ] if not values: return [], [] # if a value is a list, we need to flatten it str_values = _flatten_unique(values) # if values are SQLRecord, we don't need to validate them if all(isinstance(v, SQLRecord) for v in str_values): assert all(v._state.adding is False for v in str_values), ( "All records must be saved." ) self.records = str_values # type: ignore validated_values = str_values # type: ignore return validated_values, [] # get all field specs for union types if self.feature: results = parse_dtype(self.feature._dtype_str) else: results = [None] all_validated = [] all_records = [] remaining_values = str_values for result in results: if not remaining_values: break # pragma: no cover if result is not None: field = result["field"] registry = field.field.model field_name = field.field.name filter_kwargs: dict[str, str | SQLRecord] = {} filter_str = result.get("filter_str", "") if filter_str: parsed_filters = parse_filter_string(filter_str) filter_kwargs.update( resolve_relation_filters(parsed_filters, registry) ) if registry.__base__.__name__ == "BioRecord": organism_record = get_organism_record_from_field( field=field, organism=None, values=remaining_values, ) if organism_record is not None: filter_kwargs["organism"] = organism_record # Merge in self._filter_kwargs (contains cat_filters from Feature) if self._filter_kwargs: filter_kwargs.update(self._filter_kwargs) filter_kwargs = get_current_filter_kwargs(registry, filter_kwargs) else: field = self._field registry = self._registry field_name = self._field_name filter_kwargs = self._filter_kwargs # inspect the default instance and save validated records from public if issubclass(registry, HasType): if self._type_record is None: # When we have a Schema with typed members, # scope the query to the types present in the schema's members (plus untyped features) # to avoid ambiguous matches across different feature types. 
qs = registry.filter() if self._schema and self._schema.n_members: type_ids = { m.type_id for m in self._schema.members if m.type_id is not None } if type_ids: qs = registry.filter( Q(type_id__in=type_ids) | Q(type_id__isnull=True) ) self._subtype_query_set = qs else: query_sub_types = getattr( self._type_record, f"query_{registry.__name__.lower()}s" ) self._subtype_query_set = query_sub_types() subtype_query_set = ( self._subtype_query_set.filter(**filter_kwargs) if filter_kwargs else self._subtype_query_set ) values_array = np.array(remaining_values) validated_mask = subtype_query_set.validate( values_array, field=field, mute=True ) validated_values, non_validated_values = ( list(set(values_array[validated_mask])), list(set(values_array[~validated_mask])), ) records = subtype_query_set.filter( **{f"{field_name}__in": validated_values} ).to_list() records = keep_topmost_matches(records) else: existing_and_public_records = _from_values( remaining_values, field=field, mute=True, **filter_kwargs, # type: ignore ) existing_and_public_values = [ getattr(r, field_name) for r in existing_and_public_records ] # public records that are not already in the database public_records = [ r for r in existing_and_public_records if r._state.adding ] if len(public_records) > 0: logger.info(f"saving validated records of '{self._key}'") ln_save(public_records) values_saved_public = [ getattr(r, field_name) for r in public_records ] # log the saved public labels # the term "transferred" stresses that this is always in the context of transferring # labels from a public ontology or a different instance to the present instance if len(values_saved_public) > 0: s = "s" if len(values_saved_public) > 1 else "" logger.success( f'added {len(values_saved_public)} record{s} {colors.green("from_public")} with {model_field} for "{self._key}": {_format_values(values_saved_public)}' ) # non-validated records from the default instance non_validated_values = [ i for i in remaining_values if i not in existing_and_public_values ] validated_values = existing_and_public_values records = existing_and_public_records all_validated.extend(validated_values) all_records.extend(records) remaining_values = non_validated_values self.records = all_records # validated values, non-validated values return all_validated, remaining_values def _add_new( self, values: list[str], df: pd.DataFrame | None = None, # remove when all users use schema dtype: str | None = None, **create_kwargs, ) -> None: """Add new labels to the registry.""" from lamindb.models.save import save as ln_save non_validated_records: SQLRecordList[Any] = [] # type: ignore if df is not None and self._registry == Feature: nonval_columns = Feature.inspect(df.columns, mute=True).non_validated non_validated_records = Feature.from_dataframe(df.loc[:, nonval_columns]) else: organism_record = self._filter_kwargs.get("organism", None) for value in values: init_kwargs = {self._field_name: value} if self._registry == Feature: init_kwargs["dtype"] = "cat" if dtype is None else dtype if self._type_record is not None: # if type_record is set, we need to set the type for new records init_kwargs["type"] = self._type_record if organism_record is not None: init_kwargs["organism"] = organism_record # here we create non-validated records skipping validation since we already ensured that they don't exist non_validated_records.append( self._registry( **init_kwargs, **create_kwargs, _skip_validation=True ) ) if len(non_validated_records) > 0: ln_save(non_validated_records) model_field = 
colors.italic(self._registry.__get_name_with_module__()) s = "s" if len(values) > 1 else "" logger.success( f'added {len(values)} record{s} with {model_field} for "{self._key}": {_format_values(values)}' ) def _validate( self, values: list[str], ) -> tuple[list[str], dict]: """Validate ontology terms using LaminDB registries.""" model_field = f"{self._registry.__name__}.{self._field_name}" # get all field specs for union types if self.feature: results = parse_dtype(self.feature._dtype_str) else: results = [{"field": self._field}] non_validated = values syn_mapper: dict[str, str] = {} for result in results: if not non_validated: break field = result["field"] registry = field.field.model filter_kwargs = self._filter_kwargs.copy() filter_str = result.get("filter_str", "") if filter_str: parsed_filters = parse_filter_string(filter_str) filter_kwargs.update(resolve_relation_filters(parsed_filters, registry)) registry_or_queryset = registry if self._subtype_query_set is not None and registry == self._registry: registry_or_queryset = self._subtype_query_set # first inspect against the registry inspect_result = registry_or_queryset.filter(**filter_kwargs).inspect( non_validated, field=field, mute=True, from_source=False, ) # here non_validated includes synonyms and new values non_validated = inspect_result.non_validated syn_mapper.update(inspect_result.synonyms_mapper) # logging messages if self._cat_manager is not None: slot = self._cat_manager._slot else: slot = None in_slot = f" in slot '{slot}'" if slot is not None else "" slot_prefix = f".slots['{slot}']" if slot is not None else "" non_validated_hint_print = ( f"curator{slot_prefix}.cat.add_new_from('{self._key}')" ) n_non_validated = len(non_validated) if n_non_validated == 0: logger.success( f'"{self._key}" is validated against {colors.italic(model_field)}' ) return [], {} else: s = "" if n_non_validated == 1 else "s" print_values = _format_values(non_validated) warning_message = f"{colors.red(f'{n_non_validated} term{s}')} not validated in feature '{self._key}'{in_slot}: {colors.red(print_values)}\n" # log synonyms if any if syn_mapper: s = "" if len(syn_mapper) == 1 else "s" syn_mapper_print = _format_values( [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep="" ) hint_msg = f'.standardize("{self._key}")' warning_message += f" {colors.yellow(f'{len(syn_mapper)} synonym{s}')} found: {colors.yellow(syn_mapper_print)}\n → curate synonyms via: {colors.cyan(hint_msg)}" if n_non_validated > len(syn_mapper): if syn_mapper: warning_message += "\n for remaining terms:\n" check_organism = "" if ( self._registry.__base__.__name__ == "BioRecord" and self._registry.require_organism(field=self._field) ): organism = self._filter_kwargs.get("organism", None) check_organism = f"fix organism '{organism}', " warning_message += f" → {check_organism}fix typos, remove non-existent values, or save terms via: {colors.cyan(non_validated_hint_print)}" if self._subtype_query_set is not None and self._type_record: warning_message += f"\n → a valid label for subtype '{self._type_record.name}' has to be one of {self._subtype_query_set.to_list('name')}" logger.info(f'mapping "{self._key}" on {colors.italic(model_field)}') logger.warning(warning_message) if self._cat_manager is not None: self._cat_manager._validate_category_error_messages = strip_ansi_codes( warning_message ) return non_validated, syn_mapper def validate(self) -> None: """Validate the vector.""" # add source-validated values to the registry self._validated, self._non_validated = 
self._add_validated() self._non_validated, self._synonyms = self._validate(values=self._non_validated) def standardize(self) -> None: """Standardize the vector.""" if not hasattr(self._registry, "standardize"): return self.values if self._synonyms is None: self.validate() # get standardized values std_values = self._replace_synonyms() # update non_validated values self._non_validated = [ i for i in self._non_validated if i not in self._synonyms.keys() ] # remove synonyms since they are now standardized self._synonyms = {} # update the values with the standardized values self.values = std_values def add_new(self, **create_kwargs) -> None: """Add new values to the registry.""" if self._non_validated is None: self.validate() if len(self._synonyms) > 0: # raise error because .standardize modifies the input dataset raise ValidationError( "Please run `.standardize()` before adding new values." ) self._add_new( values=self._non_validated, **create_kwargs, ) # remove the non_validated values since they are now registered self._non_validated = [] class DataFrameCatManager: """Manage categoricals by updating registries. This class is accessible from within a `DataFrameCurator` via the `.cat` attribute. If you find non-validated values, you have two options: - new values found in the data can be registered via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.core.DataFrameCatManager.add_new_from` - non-validated values can be accessed via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.core.DataFrameCatManager.non_validated` and addressed manually """ def __init__( self, df: pd.DataFrame | Artifact, columns_field: FieldAttr = Feature.name, categoricals: list[Feature] | None = None, sources: dict[str, SQLRecord] | None = None, index: Feature | None = None, slot: str | None = None, maximal_set: bool = False, schema: Schema | None = None, ) -> None: self._non_validated = None self._index = index self._artifact: Artifact = None # pass the dataset as an artifact self._dataset: Any = df # pass the dataset as an AnyPathStr or data object if isinstance(self._dataset, Artifact): self._artifact = self._dataset self._dataset = self._dataset.load(is_run_input=False) self._is_validated: bool = False self._categoricals = categoricals or [] self._non_validated = None self._sources = sources or {} self._columns_field = columns_field self._validate_category_error_messages: str = "" self._cat_vectors: dict[str, CatVector] = {} self._slot = slot self._maximal_set = maximal_set columns = self._dataset.keys() if maximal_set: columns = [ col for col in columns if not re.match(LAMINDB_COLUMN_PREFIX_REGEX, col) ] self._cat_vectors["columns"] = CatVector( values_getter=lambda: columns, # lambda ensures the inplace update values_setter=lambda new_values: setattr( self._dataset, "columns", pd.Index(new_values) ) if isinstance(self._dataset, pd.DataFrame) else None, field=columns_field, key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys", source=self._sources.get("columns"), cat_manager=self, maximal_set=self._maximal_set, filter_str="" if schema.flexible else "unsaved" if schema.id is None else f"schemas__id={schema.id}", schema=schema, ) for feature in self._categoricals: result = parse_dtype(feature._dtype_str)[0] key = feature.name # only create CatVector if the key exists in the DataFrame if key in self._dataset.columns: self._cat_vectors[key] = CatVector( values_getter=lambda k=key: self._dataset[ k ], # Capture key as default argument values_setter=lambda new_values, k=key: 
self._dataset.__setitem__( k, new_values ), field=result["field"], key=key, source=self._sources.get(key), feature=feature, cat_manager=self, filter_str=result["filter_str"], record_uid=result.get("record_uid"), ) if index is not None and index._dtype_str.startswith("cat"): result = parse_dtype(index._dtype_str)[0] key = "index" self._cat_vectors[key] = CatVector( values_getter=self._dataset.index, values_setter=lambda new_values: setattr( self._dataset, "index", new_values ), field=result["field"], key=key, feature=index, cat_manager=self, filter_str=result["filter_str"], record_uid=result.get("record_uid"), ) @property def non_validated(self) -> dict[str, list[str]]: """Return the non-validated features and labels.""" if self._non_validated is None: raise ValidationError("Please run validate() first!") return { key: cat_vector._non_validated for key, cat_vector in self._cat_vectors.items() if cat_vector._non_validated and key != "columns" } @property def categoricals(self) -> list[Feature]: """The categorical features.""" return self._categoricals def __repr__(self) -> str: cls_name = colors.green(self.__class__.__name__) status_str = ( f"{colors.green('validated')}" if self._is_validated else f"{colors.yellow('unvalidated')}" ) info_parts = [] cat_count = len(self._categoricals) if cat_count > 0: info_parts.append(f"categorical_features={cat_count}") if self._slot: info_parts.append(f"slot: {colors.italic(self._slot)}") info_str = ", ".join(info_parts) if info_str: return f"{cls_name}({info_str}, {status_str})" else: return f"{cls_name}({status_str})" def lookup(self, public: bool = False) -> CatLookup: """Lookup categories. Args: public: If "public", the lookup is performed on the public reference. """ return CatLookup( categoricals=self._categoricals, slots={"columns": self._columns_field}, public=public, sources=self._sources, ) def validate(self) -> bool: """Validate variables and categorical observations.""" self._validate_category_error_messages = "" # reset the error messages validated = True for key, cat_vector in self._cat_vectors.items(): logger.info(f"validating vector {key}") cat_vector.validate() validated &= cat_vector.is_validated self._is_validated = validated self._non_validated = {} # type: ignore if self._index is not None: # cat_vector.validate() populates validated labels # the index should become part of the feature set corresponding to the dataframe if self._cat_vectors["columns"].records is not None: self._cat_vectors["columns"].records.insert(0, self._index) # type: ignore else: self._cat_vectors["columns"].records = [self._index] # type: ignore return self._is_validated def standardize(self, key: str) -> None: """Replace synonyms with standardized values. Modifies the input dataset inplace. Args: key: The key referencing the column in the DataFrame to standardize. """ if self._artifact is not None: raise RuntimeError( "Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator." ) if key == "all": logger.warning( "'all' is deprecated, please pass a single key from `.non_validated.keys()` instead!" ) for k in self.non_validated.keys(): self._cat_vectors[k].standardize() else: self._cat_vectors[key].standardize() def add_new_from(self, key: str, **kwargs): """Add validated & new categories. Args: key: The key referencing the slot in the DataFrame from which to draw terms. 
**kwargs: Additional keyword arguments to pass to create new records """ if len(kwargs) > 0 and key == "all": raise ValueError("Cannot pass additional arguments to 'all' key!") if key == "all": logger.warning( "'all' is deprecated, please pass a single key from `.non_validated.keys()` instead!" ) for k in self.non_validated.keys(): self._cat_vectors[k].add_new(**kwargs) else: self._cat_vectors[key].add_new(**kwargs) def get_current_filter_kwargs( registry: type[SQLRecord], kwargs: dict[str, str | SQLRecord] ) -> dict: """Make sure the source and organism are saved in the same database as the registry.""" db = registry.filter().db filter_kwargs = kwargs.copy() for key, value in kwargs.items(): if isinstance(value, SQLRecord) and value._state.db != "default": if db is None or db == "default": value_default = copy.copy(value) value_default.save() filter_kwargs[key] = value_default return filter_kwargs def annotate_artifact( artifact: Artifact, *, curator: SlotsCurator | None = None, cat_vectors: dict[str, CatVector] | None = None, ) -> Artifact: from .. import settings from ..models.artifact import add_labels from ..models.schema import ArtifactSchema if cat_vectors is None: cat_vectors = {} # annotate with labels for key, cat_vector in cat_vectors.items(): if ( cat_vector._registry == Feature or key == "columns" or key == "var_index" or cat_vector.records is None ): continue if len(cat_vector.records) > settings.annotation.n_max_records: logger.important( f"not annotating with {len(cat_vector.records)} labels for feature {key} as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)" ) continue add_labels( artifact, records=cat_vector.records, feature=cat_vector.feature, from_curator=True, ) # annotate with inferred schemas aka feature sets if ( artifact.otype == "DataFrame" and getattr(curator, "_schema", None) is None ): # Prevent overwriting user-defined schemas that contain slots features = cat_vectors["columns"].records if features is not None: index_feature = artifact.schema.index index_feature_id = None if index_feature is None else index_feature.id feature_set = Schema( features=[ f for f in features if index_feature_id is None or f.id != index_feature_id ], itype=artifact.schema.itype, index=index_feature, minimal_set=artifact.schema.minimal_set, maximal_set=artifact.schema.maximal_set, coerce=artifact.schema.coerce, ordered_set=artifact.schema.ordered_set, ) if ( feature_set._state.adding and len(features) > settings.annotation.n_max_records ): logger.important( f"not annotating with {len(features)} features as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)" ) itype = ( Feature.name if artifact.schema.itype == "Composite" # backward compat else parse_cat_dtype(artifact.schema.itype, is_itype=True)["field"] ) feature_set = Schema(itype=itype, n_members=len(features)) ArtifactSchema.objects.update_or_create( artifact=artifact, slot="columns", defaults={"schema": feature_set.save()}, ) else: for slot, slot_curator in curator._slots.items(): # var_index is backward compat (2025-05-01) name = ( "var_index" if (slot == "var" and "var_index" in slot_curator.cat._cat_vectors) else "columns" ) features = slot_curator.cat._cat_vectors[name].records if features is None: logger.warning(f"no features found for slot {slot}") continue validating_schema = slot_curator._schema index_feature = validating_schema.index index_feature_id = None if index_feature is None else index_feature.id feature_set = Schema( features=[ f for f in 
features if index_feature_id is None or f.id != index_feature_id ], itype=validating_schema.itype, index=index_feature, minimal_set=validating_schema.minimal_set, maximal_set=validating_schema.maximal_set, coerce=validating_schema.coerce, ordered_set=validating_schema.ordered_set, ) if ( feature_set._state.adding and len(features) > settings.annotation.n_max_records ): logger.important( f"not annotating with {len(features)} features for slot {slot} as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)" ) itype = ( Feature.name if artifact.schema.slots[slot].itype == "Composite" # backward compat else parse_cat_dtype( artifact.schema.slots[slot].itype, is_itype=True )["field"] ) feature_set = Schema(itype=itype, n_members=len(features)) ArtifactSchema.objects.update_or_create( artifact=artifact, slot=slot, defaults={"schema": feature_set.save()} ) slug = ln_setup.settings.instance.slug if ln_setup.settings.instance.is_remote: # pdagma: no cover ui_url = ln_setup.settings.instance.ui_url logger.important(f"go to {ui_url}/{slug}/artifact/{artifact.uid}") return artifact def _flatten_unique(series: pd.Series[list[Any] | Any]) -> list[Any]: """Flatten a Pandas series containing lists or single items into a unique list of elements. The order of elements in the result list preserves the order they first appear in the input series. """ # Use dict.fromkeys to preserve order while ensuring uniqueness result: dict = {} for item in series: if isinstance(item, list | np.ndarray): # Add each element to the dict (only first occurrence is kept) for element in item: result[element] = None else: result[item] = None # Return the keys as a list, preserving order return list(result.keys()) ================================================ FILE: lamindb/errors.py ================================================ """Errors. Django. .. autoexception:: ObjectDoesNotExist .. autoexception:: MultipleObjectsReturned LaminDB. .. autoexception:: ValidationError .. autoexception:: InvalidArgument .. autoexception:: NotebookNotSaved .. autoexception:: UnknownStorageLocation .. autoexception:: MissingContextUID .. autoexception:: UpdateContext .. autoexception:: IntegrityError .. autoexception:: FieldValidationError .. autoexception:: NoWriteAccess .. autoexception:: BlobHashNotFound .. autoexception:: FileNotInDevDir .. autoexception:: BranchAlreadyExists """ # ------------------------------------------------------------------------------------- # Django # ------------------------------------------------------------------------------------- from django.core.exceptions import ( MultipleObjectsReturned, # noqa: F401 ObjectDoesNotExist, # noqa: F401 ) ObjectDoesNotExist.__doc__ = """Object does not exist. This is an alias for `django.core.exceptions.ObjectDoesNotExist`. """ DoesNotExist = ObjectDoesNotExist # backward compat MultipleObjectsReturned.__doc__ = """Multiple objects returned. This is an alias for `django.core.exceptions.MultipleObjectsReturned`. 
""" MultipleResultsFound = MultipleObjectsReturned # backward compat # ------------------------------------------------------------------------------------- # lamindb # ------------------------------------------------------------------------------------- class ValidationError(Exception): """Validation error.""" pass class InvalidArgument(Exception): """Invalid method or function argument.""" pass class TrackNotCalled(Exception): """`ln.track()` wasn't called.""" pass class NotebookNotSaved(Exception): """Notebook wasn't saved.""" pass class UnknownStorageLocation(Exception): """Path is not contained in any known storage location.""" pass class NoStorageLocationForSpace(Exception): """No storage location found for space.""" pass class InconsistentKey(Exception): """Inconsistent transform or artifact `key`.""" pass class FieldValidationError(Exception): """Field validation error.""" pass # ------------------------------------------------------------------------------------- # run context # ------------------------------------------------------------------------------------- class IntegrityError(Exception): """Integrity error. For instance, it's not allowed to delete artifacts outside managed storage locations. """ pass class MissingContextUID(Exception): """User didn't define transform settings.""" pass class UpdateContext(Exception): """Transform settings require update.""" pass class BlobHashNotFound(Exception): """Blob hash not found in git or storage.""" pass # ------------------------------------------------------------------------------------- # CRUD # ------------------------------------------------------------------------------------- class NoWriteAccess(Exception): """No write access to a space.""" pass class FileNotInDevDir(Exception): """File path is not within the configured dev directory.""" pass class BranchAlreadyExists(Exception): """Branch already exists. Raised when creating a branch with `ln.setup.switch(..., create=True)` and a branch with the given name or uid already exists. Consistent with `git switch -c`. """ pass ================================================ FILE: lamindb/examples/__init__.py ================================================ """Examples. .. autosummary:: :toctree: . schemas datasets cellxgene croissant mlflow wandb """ from . import croissant, datasets, mlflow, schemas, wandb from .cellxgene import _cellxgene ================================================ FILE: lamindb/examples/cellxgene/__init__.py ================================================ """CELLxGENE utilities. .. autofunction:: save_cellxgene_defaults .. autofunction:: create_cellxgene_schema """ from ._cellxgene import ( create_cellxgene_schema, save_cellxgene_defaults, ) ================================================ FILE: lamindb/examples/cellxgene/_cellxgene.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Collection, Literal, NamedTuple if TYPE_CHECKING: from lamindb.base.types import FieldAttr from lamindb.models import Registry, Schema CELLxGENEOrganisms = Literal[ "human", "mouse", "zebra danio", "rhesus macaquedomestic pig", "chimpanzee", "white-tufted-ear marmoset", "sars-2", ] FieldType = Literal["ontology_id", "name"] def save_cellxgene_defaults() -> None: """Save default values of the CELLxGENE schema to the instance. 
Adds CELLxGENE specific (control) values that are not available in the ontologies: - "normal" Disease - "na" Ethnicity - "unknown" entries for DevelopmentalStage, Phenotype, and CellType - "tissue", "organoid", "primary cell culture", and "cell line" ULabels (tissue_type) - "cell", "nucleus", "na" ULabels (suspension_type) """ import bionty as bt from lamindb.models import ULabel # "normal" in Disease normal = bt.Phenotype.from_source( ontology_id="PATO:0000461", source=bt.Source.get(name="pato", currently_used=True), ) bt.Disease( uid=normal.uid, name=normal.name, ontology_id=normal.ontology_id, description=normal.description, source=normal.source, # not sure ).save() # na, unknown for model, name in zip( [ bt.Ethnicity, bt.Ethnicity, bt.DevelopmentalStage, bt.Phenotype, bt.CellType, ], ["na", "unknown", "unknown", "unknown", "unknown"], ): model(ontology_id=name, name=name, description="From CellxGene schema.").save() # tissue_type tissue_type = ULabel( name="TissueType", is_type=True, description='From CellxGene schema. Is "tissue", "organoid", "primary cell culture", or "cell line".', ).save() for name in ["tissue", "organoid", "primary cell culture", "cell line"]: ULabel(name=name, type=tissue_type, description="From CellxGene schema.").save() # suspension_type suspension_type = ULabel( name="SuspensionType", is_type=True, description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".', ).save() for name in ["cell", "nucleus", "na"]: ULabel( name=name, type=suspension_type, description="From CellxGene schema." ).save() # organisms taxonomy_ids = [ "NCBITaxon:9606", # Homo sapiens (Human) "NCBITaxon:10090", # Mus musculus (House mouse) "NCBITaxon:9544", # Macaca mulatta (Rhesus monkey) "NCBITaxon:9825", # Sus scrofa domesticus (Domestic pig) "NCBITaxon:9598", # Pan troglodytes (Chimpanzee) "NCBITaxon:9483", # Callithrix jacchus (White-tufted-ear marmoset) "NCBITaxon:7955", # Danio rerio (Zebrafish) ] for ontology_id in taxonomy_ids: bt.Organism.from_source( ontology_id=ontology_id, source=bt.Source.get(name="ncbitaxon", currently_used=True), ).save() def create_cellxgene_schema( *, field_types: FieldType | Collection[FieldType] = "ontology_id", spatial_library_id: str | None = None, organism: CELLxGENEOrganisms = "human", ) -> Schema: """Generates a :class:`~lamindb.Schema` for a specific CELLxGENE schema version. Args: field_types: One or several of 'ontology_id', 'name'. organism: The organism of the Schema. library_id: Identifier for the spatial library. Specifying this value enables curation against spatial requirements. 
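Example (a minimal sketch; ``adata`` is an illustrative AnnData object assumed to follow CELLxGENE conventions)::

    import lamindb as ln

    # register the CELLxGENE control values once per instance
    ln.examples.cellxgene.save_cellxgene_defaults()

    # build a schema validating ontology-id columns, then curate
    schema = ln.examples.cellxgene.create_cellxgene_schema(field_types="ontology_id")
    curator = ln.curators.AnnDataCurator(adata, schema)
    curator.validate()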
""" import bionty as bt from lamindb.models import Feature, Schema, ULabel class CategorySpec(NamedTuple): field: str | FieldAttr | list[Registry] default: str | None needs_organism: bool = False categoricals_to_spec: dict[str, CategorySpec] = { "assay": CategorySpec(bt.ExperimentalFactor.name, None, False), "assay_ontology_term_id": CategorySpec( bt.ExperimentalFactor.ontology_id, None, False ), "cell_type": CategorySpec(bt.CellType.name, "unknown", False), "cell_type_ontology_term_id": CategorySpec( bt.CellType.ontology_id, None, False ), "development_stage": CategorySpec(bt.DevelopmentalStage.name, "unknown", True), "development_stage_ontology_term_id": CategorySpec( bt.DevelopmentalStage.ontology_id, None, True ), "disease": CategorySpec(bt.Disease.name, "normal", False), "disease_ontology_term_id": CategorySpec(bt.Disease.ontology_id, None, False), "self_reported_ethnicity": CategorySpec(bt.Ethnicity.name, "unknown", False), "self_reported_ethnicity_ontology_term_id": CategorySpec( bt.Ethnicity.ontology_id, None, False ), "sex": CategorySpec(bt.Phenotype.name, "unknown", False), "sex_ontology_term_id": CategorySpec(bt.Phenotype.ontology_id, None, False), "suspension_type": CategorySpec(ULabel.name, "cell", False), "tissue": CategorySpec(bt.Tissue.name, None, False), "tissue_ontology_term_id": CategorySpec( [bt.Tissue.ontology_id, bt.CellType.ontology_id], None, False ), "tissue_type": CategorySpec(ULabel.name, "tissue", False), "organism": CategorySpec(bt.Organism.scientific_name, None, False), "organism_ontology_term_id": CategorySpec(bt.Organism.ontology_id, None, False), "donor_id": CategorySpec(str, "unknown", False), } def _get_source_cat_filters( field: str | FieldAttr | type[Registry], *, needs_organism: bool | None = None ) -> dict | None: """Some ontology are organism specific and their Features therefore need a `cat_filter`.""" if isinstance(field, str) or not needs_organism: return None registry = field.field.model if hasattr(field, "field") else field entity = f"bionty.{registry.__name__}" filters = {"entity": entity, "currently_used": True} if needs_organism: filters["organism"] = organism return {"source": bt.Source.filter(**filters).one()} field_types_set = ( {field_types} if isinstance(field_types, str) else set(field_types) ) if field_types_set == {"ontology_id"}: categoricals = { k: v.field for k, v in categoricals_to_spec.items() if k.endswith("_ontology_term_id") or k == "donor_id" } elif field_types_set == {"name"}: categoricals = { k: v.field for k, v in categoricals_to_spec.items() if not k.endswith("_ontology_term_id") and k != "donor_id" } elif field_types_set == {"name", "ontology_id"}: categoricals = {k: v.field for k, v in categoricals_to_spec.items()} else: raise ValueError( f"Invalid field_types: {field_types}. Must contain 'ontology_id', 'name', or both." 
) organism_fields = {"organism", "organism_ontology_term_id"} obs_categoricals = { k: v for k, v in categoricals.items() if k not in organism_fields } var_schema = Schema( name="var of CELLxGENE", index=Feature( name="var_index", dtype=bt.Gene.ensembl_gene_id, cat_filters=_get_source_cat_filters( bt.Gene.ensembl_gene_id, needs_organism=True ), ).save(), itype=Feature, features=[Feature(name="feature_is_filtered", dtype=bool).save()], dtype="DataFrame", coerce=True, ).save() obs_features = [] for field in obs_categoricals: if field == "var_index": continue dtype = obs_categoricals[field] needs_organism = categoricals_to_spec[field].needs_organism cat_filters: dict | list[dict] | None if isinstance(dtype, list): cat_filters = ( [ _get_source_cat_filters(d, needs_organism=needs_organism) for d in dtype ] if needs_organism else None ) elif not isinstance(dtype, str): cat_filters = _get_source_cat_filters(dtype, needs_organism=needs_organism) else: cat_filters = None obs_features.append( Feature( # type: ignore name=field, dtype=dtype, default_value=categoricals_to_spec[field].default, cat_filters=cat_filters, # type: ignore ).save() ) for name in ["is_primary_data", "suspension_type", "tissue_type"]: obs_features.append(Feature(name=name, dtype=ULabel.name).save()) obs_schema = Schema( name=f"obs of CELLxGENE of {field_types}", features=obs_features, otype="DataFrame", minimal_set=True, coerce=True, ).save() slots = {"var": var_schema, "obs": obs_schema} uns_categoricals = {k: v for k, v in categoricals.items() if k in organism_fields} uns_features = [ Feature( name=field, dtype=uns_categoricals[field], default_value=categoricals_to_spec[field].default, ).save() for field in uns_categoricals ] uns_schema = Schema( name="uns of CELLxGENE version", features=uns_features, otype="DataFrame", minimal_set=True, coerce=True, ).save() slots["uns"] = uns_schema # Add spatial validation if library_id is provided if spatial_library_id: scalefactors_schema = Schema( name=f"scalefactors of spatial {spatial_library_id}", features=[ Feature(name="spot_diameter_fullres", dtype=float).save(), Feature(name="tissue_hires_scalef", dtype=float).save(), ], ).save() spatial_schema = Schema( name="CELLxGENE spatial metadata", features=[ Feature( name="is_single", dtype=bool, description="True if dataset represents single spatial unit (tissue section for Visium, array for Slide-seqV2)", ).save() ], ).save() slots["uns:spatial"] = spatial_schema slots[f"uns:spatial:{spatial_library_id}:scalefactors"] = scalefactors_schema # Spatial library ID must be in the name # Otherwise, we have lookup side effects where other existing Spatial Library IDs make it into the Schema schema_name = f"CELLxGENE AnnData of {', '.join(field_types) if isinstance(field_types, list) else field_types}" if spatial_library_id: schema_name += f" ({spatial_library_id})" full_cxg_schema = Schema( name=schema_name, otype="AnnData", minimal_set=True, coerce=True, slots=slots, ).save() return full_cxg_schema ================================================ FILE: lamindb/examples/croissant/__init__.py ================================================ """Examples for MLCommons Croissant files, which are used to store metadata about datasets. .. autofunction:: mini_immuno """ import json from pathlib import Path def mini_immuno( n_files: int = 1, filepath_prefix: str = "", strip_version: bool = False ) -> list[Path]: """Return paths to the mini immuno dataset and its metadata as a Croissant file. Args: n_files: Number of files inside the croissant file. 
filepath_prefix: Move the dataset and references to it in a specific directory. Example :: croissant_path, dataset1_path = ln.examples.croissant.mini_immuno() croissant_path, dataset1_path, dataset2_path = ln.examples.croissant.mini_immuno(n_files=2) """ from ..datasets import file_mini_csv from ..datasets.mini_immuno import get_dataset1 adata = get_dataset1(otype="AnnData") if filepath_prefix: dataset1_path = Path(filepath_prefix) / "mini_immuno.anndata.zarr" else: dataset1_path = Path("mini_immuno.anndata.zarr") adata.write_zarr(dataset1_path) orig_croissant_path = ( Path(__file__).parent / "mini_immuno.anndata.zarr_metadata.json" ) with open(orig_croissant_path, encoding="utf-8") as f: data = json.load(f) if filepath_prefix: assert data["distribution"][0]["@id"] == "mini_immuno.anndata.zarr" # noqa: S101 data["distribution"][0]["@id"] = str(Path(filepath_prefix) / dataset1_path.name) if strip_version: data.pop("version", None) if n_files == 2: file_mini_csv() if filepath_prefix: dataset2_path = Path(filepath_prefix) / "mini.csv" else: dataset2_path = Path("mini.csv") data["distribution"].append( { "@type": "sc:FileObject", "@id": dataset2_path.as_posix(), "name": "mini.csv", "encodingFormat": "text/csv", } ) croissant_path = Path("mini_immuno.anndata.zarr_metadata.json") with open(croissant_path, "w", encoding="utf-8") as f: json.dump(data, f, indent=2) result: list[Path] = [croissant_path, dataset1_path] if n_files == 1: return result result.append(dataset2_path) return result ================================================ FILE: lamindb/examples/croissant/mini_immuno.anndata.zarr_metadata.json ================================================ { "@context": { "@vocab": "https://schema.org/", "cr": "https://mlcommons.org/croissant/", "ml": "http://ml-schema.org/", "sc": "https://schema.org/", "dct": "http://purl.org/dc/terms/", "data": "https://mlcommons.org/croissant/data/", "rai": "https://mlcommons.org/croissant/rai/", "format": "https://mlcommons.org/croissant/format/", "citeAs": "https://mlcommons.org/croissant/citeAs/", "conformsTo": "https://mlcommons.org/croissant/conformsTo/", "@language": "en", "repeated": "https://mlcommons.org/croissant/repeated/", "field": "https://mlcommons.org/croissant/field/", "examples": "https://mlcommons.org/croissant/examples/", "recordSet": "https://mlcommons.org/croissant/recordSet/", "fileObject": "https://mlcommons.org/croissant/fileObject/", "fileSet": "https://mlcommons.org/croissant/fileSet/", "source": "https://mlcommons.org/croissant/source/", "references": "https://mlcommons.org/croissant/references/", "key": "https://mlcommons.org/croissant/key/", "parentField": "https://mlcommons.org/croissant/parentField/", "isLiveDataset": "https://mlcommons.org/croissant/isLiveDataset/", "separator": "https://mlcommons.org/croissant/separator/", "extract": "https://mlcommons.org/croissant/extract/", "subField": "https://mlcommons.org/croissant/subField/", "regex": "https://mlcommons.org/croissant/regex/", "column": "https://mlcommons.org/croissant/column/", "path": "https://mlcommons.org/croissant/path/", "fileProperty": "https://mlcommons.org/croissant/fileProperty/", "md5": "https://mlcommons.org/croissant/md5/", "jsonPath": "https://mlcommons.org/croissant/jsonPath/", "transform": "https://mlcommons.org/croissant/transform/", "replace": "https://mlcommons.org/croissant/replace/", "dataType": "https://mlcommons.org/croissant/dataType/", "includes": "https://mlcommons.org/croissant/includes/", "excludes": "https://mlcommons.org/croissant/excludes/" 
}, "@type": "Dataset", "name": "Mini immuno dataset", "description": "A few samples from the immunology dataset", "url": "https://lamin.ai/laminlabs/lamindata/artifact/tCUkRcaEjTjhtozp0000", "creator": { "@type": "Person", "name": "falexwolf" }, "dateCreated": "2025-07-16", "cr:projectName": "Mini Immuno Project", "datePublished": "2025-07-16", "version": "1.0", "license": "https://creativecommons.org/licenses/by/4.0/", "citation": "Please cite this dataset as: mini immuno (2025)", "encodingFormat": "zarr", "distribution": [ { "@type": "cr:FileSet", "@id": "mini_immuno.anndata.zarr", "containedIn": { "@id": "directory" }, "encodingFormat": "zarr" } ], "cr:recordSet": [ { "@type": "cr:RecordSet", "@id": "#samples", "name": "samples", "description": "my sample" } ] } ================================================ FILE: lamindb/examples/datasets/__init__.py ================================================ """Example datasets. The mini immuno dataset ----------------------- .. autosummary:: :toctree: . mini_immuno Small in-memory datasets ------------------------ .. autofunction:: anndata_with_obs Files ----- .. autofunction:: file_fcs .. autofunction:: file_fcs_alpert19 .. autofunction:: file_tsv_rnaseq_nfcore_salmon_merged_gene_counts .. autofunction:: file_jpg_paradisi05 .. autofunction:: file_tiff_suo22 .. autofunction:: file_fastq .. autofunction:: file_bam .. autofunction:: file_mini_csv Directories ----------- .. autofunction:: dir_scrnaseq_cellranger .. autofunction:: dir_iris_images Dictionary, Dataframe, AnnData, MuData, SpatialData ---------------------------------------------------- .. autofunction:: dict_cellxgene_uns .. autofunction:: df_iris .. autofunction:: df_iris_in_meter .. autofunction:: df_iris_in_meter_study1 .. autofunction:: df_iris_in_meter_study2 .. autofunction:: anndata_mouse_sc_lymph_node .. autofunction:: anndata_human_immune_cells .. autofunction:: anndata_pbmc68k_reduced .. autofunction:: anndata_file_pbmc68k_test .. autofunction:: anndata_pbmc3k_processed .. autofunction:: anndata_suo22_Visium10X .. autofunction:: anndata_visium_mouse_cellxgene .. autofunction:: mudata_papalexi21_subset .. autofunction:: schmidt22_crispra_gws_IFNG .. autofunction:: schmidt22_perturbseq .. autofunction:: spatialdata_blobs Other ----- .. autofunction:: fake_bio_notebook_titles """ import importlib.util import sys from typing import TYPE_CHECKING if TYPE_CHECKING: from . 
import mini_immuno from ._core import ( anndata_file_pbmc68k_test, anndata_human_immune_cells, anndata_mouse_sc_lymph_node, anndata_pbmc3k_processed, anndata_pbmc68k_reduced, anndata_suo22_Visium10X, anndata_visium_mouse_cellxgene, df_iris, df_iris_in_meter, df_iris_in_meter_study1, df_iris_in_meter_study2, dict_cellxgene_uns, dir_iris_images, dir_scrnaseq_cellranger, file_bam, file_fastq, file_fcs, file_fcs_alpert19, file_jpg_paradisi05, file_mini_csv, file_tiff_suo22, file_tsv_rnaseq_nfcore_salmon_merged_gene_counts, mudata_papalexi21_subset, schmidt22_crispra_gws_IFNG, schmidt22_perturbseq, spatialdata_blobs, ) from ._fake import fake_bio_notebook_titles from ._small import anndata_with_obs, small_dataset3_cellxgene from .mini_immuno import get_dataset1 as small_dataset1 from .mini_immuno import get_dataset2 as small_dataset2 def __getattr__(name: str): """Lazy-import datasets to avoid loading pandas/anndata at package import.""" if name == "mini_immuno": # Use importlib to avoid __getattr__ recursion when importing submodule spec = importlib.util.find_spec( "lamindb.examples.datasets.mini_immuno", package="lamindb.examples.datasets", ) if spec is None or spec.loader is None: raise ImportError("Could not find module mini_immuno") module = importlib.util.module_from_spec(spec) sys.modules["lamindb.examples.datasets.mini_immuno"] = module spec.loader.exec_module(module) return module if name in ("small_dataset1", "small_dataset2"): mini_immuno = importlib.import_module( ".mini_immuno", package="lamindb.examples.datasets" ) return ( mini_immuno.get_dataset1 if name == "small_dataset1" else mini_immuno.get_dataset2 ) _core_names = ( "anndata_file_pbmc68k_test", "anndata_human_immune_cells", "anndata_mouse_sc_lymph_node", "anndata_pbmc3k_processed", "anndata_pbmc68k_reduced", "anndata_suo22_Visium10X", "df_iris", "df_iris_in_meter", "df_iris_in_meter_study1", "df_iris_in_meter_study2", "dict_cellxgene_uns", "dir_iris_images", "dir_scrnaseq_cellranger", "file_bam", "file_fastq", "file_fcs", "file_fcs_alpert19", "file_jpg_paradisi05", "file_mini_csv", "file_tiff_suo22", "file_tsv_rnaseq_nfcore_salmon_merged_gene_counts", "mudata_papalexi21_subset", "schmidt22_crispra_gws_IFNG", "schmidt22_perturbseq", "spatialdata_blobs", "anndata_visium_mouse_cellxgene", ) if name in _core_names: _core = importlib.import_module("._core", package="lamindb.examples.datasets") return getattr(_core, name) if name in ("anndata_with_obs", "small_dataset3_cellxgene"): _small = importlib.import_module("._small", package="lamindb.examples.datasets") return getattr(_small, name) if name == "fake_bio_notebook_titles": _fake = importlib.import_module("._fake", package="lamindb.examples.datasets") return _fake.fake_bio_notebook_titles raise AttributeError(f"module {__name__!r} has no attribute {name!r}") __all__ = [ "mini_immuno", "small_dataset1", "small_dataset2", "small_dataset3_cellxgene", "anndata_with_obs", "anndata_file_pbmc68k_test", "anndata_human_immune_cells", "anndata_mouse_sc_lymph_node", "anndata_pbmc3k_processed", "anndata_pbmc68k_reduced", "anndata_suo22_Visium10X", "anndata_visium_mouse_cellxgene", "df_iris", "df_iris_in_meter", "df_iris_in_meter_study1", "df_iris_in_meter_study2", "dict_cellxgene_uns", "dir_iris_images", "dir_scrnaseq_cellranger", "fake_bio_notebook_titles", "file_bam", "file_fastq", "file_fcs", "file_fcs_alpert19", "file_jpg_paradisi05", "file_mini_csv", "file_tiff_suo22", "file_tsv_rnaseq_nfcore_salmon_merged_gene_counts", "mudata_papalexi21_subset", "schmidt22_crispra_gws_IFNG", 
"schmidt22_perturbseq", "spatialdata_blobs", ] ================================================ FILE: lamindb/examples/datasets/_core.py ================================================ from __future__ import annotations from pathlib import Path from typing import TYPE_CHECKING, Any from urllib.request import urlretrieve import anndata as ad import pandas as pd from upath import UPath from lamindb.base.uids import base62 from lamindb.core._settings import settings if TYPE_CHECKING: from mudata import MuData from spatialdata import SpatialData def file_fcs() -> Path: """Example FCS artifact.""" filepath, _ = urlretrieve( "https://lamindb-dev-datasets.s3.amazonaws.com/.lamindb/DBNEczSgBui0bbzBXMGH.fcs", "example.fcs", ) return Path(filepath) def file_fcs_alpert19(populate_registries: bool = False) -> Path: """FCS file from Alpert19. Args: populate_registries: pre-populate metadata records to simulate existing registries # noqa """ filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/Alpert19-070314-Mike-Study+15-2013-plate+1-15-004-1-13_cells_found.fcs", "Alpert19.fcs", ) if populate_registries: import bionty as bt import readfcs import lamindb as ln verbosity = ln.settings.verbosity ln.settings.verbosity = "error" adata = readfcs.read(filepath) std = bt.CellMarker.public().standardize(adata.var.index) ln.save( bt.CellMarker.from_values( bt.CellMarker.public().inspect(std, "name").validated, "name" ) ) ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save() # type: ignore ln.Feature(name="organism", dtype=[bt.Organism]).save() # type: ignore ln.settings.verbosity = verbosity return Path(filepath) def file_jpg_paradisi05() -> Path: """JPG file example. Originally from: https://upload.wikimedia.org/wikipedia/commons/2/28/Laminopathic_nuclei.jpg """ filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/Laminopathic_nuclei.jpg", "paradisi05_laminopathic_nuclei.jpg", ) return Path(filepath) def file_tsv_rnaseq_nfcore_salmon_merged_gene_counts( populate_registries: bool = False, ) -> Path: """Gene counts table from nf-core RNA-seq pipeline. Output of: https://nf-co.re/rnaseq """ filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/salmon.merged.gene_counts.tsv", "salmon.merged.gene_counts.tsv", ) if populate_registries: import bionty as bt import lamindb as ln verbosity = ln.settings.verbosity ln.settings.verbosity = "error" ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save() # type: ignore ln.Feature(name="organism", dtype=[bt.Organism]).save() # type: ignore bt.ExperimentalFactor.from_source(ontology_id="EFO:0008896").save() ln.settings.verbosity = verbosity return Path(filepath) def file_fastq(in_storage_root=False) -> Path: """Mini mock fastq artifact.""" basedir = Path() if not in_storage_root else settings.storage.root filepath = basedir / "input.fastq.gz" with open(filepath, "w") as f: f.write("Mock fastq artifact.") return filepath def file_bam(in_storage_root=False) -> Path: """Mini mock bam artifact.""" basedir = Path() if not in_storage_root else settings.storage.root filepath = basedir / "output.bam" with open(filepath, "w") as f: f.write("Mock bam artifact.") return filepath def file_mini_csv(in_storage_root=False) -> Path: """Mini csv artifact.""" basedir = Path() if not in_storage_root else settings.storage.root filepath = basedir / "mini.csv" df = pd.DataFrame([1, 2, 3], columns=["test"]) df.to_csv(filepath, index=False) return filepath def file_tiff_suo22() -> Path: """Image file from Suo22. 
Pair with anndata_suo22_Visium10X """ filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/F121_LP1_4LIV.tiff", "F121_LP1_4LIV.tiff", ) Path("suo22/").mkdir(exist_ok=True) filepath = Path(filepath).rename("suo22/F121_LP1_4LIV.tiff") # type: ignore return Path(filepath) def dir_iris_images() -> UPath: """Directory with 3 studies of the Iris flower: 405 images & metadata. Provenance: https://lamin.ai/laminlabs/lamindata/transform/3q4MpQxRL2qZ5zKv The problem is that the same artifact was also ingested by the downstream demo notebook: https://lamin.ai/laminlabs/lamindata/transform/NJvdsWWbJlZS5zKv This is why on the UI, the artifact shows up as output of the downstream demo notebook rather than the upstream curation notebook. The lineage information should still be captured by https://github.com/laminlabs/lnschema-core/blob/a90437e91dfbd6b9002f18c3e978bd0f9c9a632d/lamindb/models.py#L2050-L2052 but we don't use this in the UI yet. """ return UPath("s3://lamindata/iris_studies") def anndata_mouse_sc_lymph_node( populate_registries: bool = False, ) -> ad.AnnData: """Mouse lymph node scRNA-seq collection from EBI. Subsampled to 10k genes. From: https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-8414/ Args: populate_registries: pre-populate metadata records to simulate existing registries # noqa """ filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/E-MTAB-8414.h5ad") adata = ad.read_h5ad(filepath) # The column names are a bit lengthy, let's abbreviate them: adata.obs.columns = ( adata.obs.columns.str.replace("Sample Characteristic", "") .str.replace("Factor Value ", "Factor Value:", regex=True) .str.replace("Factor Value\\[", "Factor Value:", regex=True) .str.replace(" Ontology Term\\[", "ontology_id:", regex=True) .str.strip("[]") .str.replace("organism part", "tissue") .str.replace("organism", "organism") .str.replace("developmental stage", "developmental_stage") .str.replace("cell type", "cell_type") # the last one could be interesting, too # .str.replace("Factor Value:Ontology Term[inferred cell_type - authors labels", "cell_type_authors") ) # subset columns to only the ones with names columns = [ col for col in adata.obs.columns if not col.startswith("ontology_id") and not col.startswith("Factor Value") and col != "strain" ] adata.obs = adata.obs[columns] # pre-populate registries if populate_registries: import bionty as bt import lamindb as ln verbosity = ln.settings.verbosity ln.settings.verbosity = "error" # strain bt.ExperimentalFactor.from_source(ontology_id="EFO:0004472").save() # developmental stage bt.ExperimentalFactor.from_source(ontology_id="EFO:0001272").save() # tissue bt.Tissue.from_source(ontology_id="UBERON:0001542").save() # cell types ln.save(bt.CellType.from_values(["CL:0000115", "CL:0000738"], "ontology_id")) # assays ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save() # type: ignore bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save() # genes validated = bt.Gene.public(organism="mouse").validate( adata.var.index, field="ensembl_gene_id" ) ln.save( bt.Gene.from_values( adata.var.index[validated][:-19], field="ensembl_gene_id", organism="mouse", ) ) # labels labels = [] for col in ["sex", "age", "genotype", "immunophenotype"]: labels += [ln.ULabel(name=name) for name in adata.obs[col]] ln.save(labels) ln.settings.verbosity = verbosity return adata def anndata_pbmc68k_reduced() -> ad.AnnData: """Modified from scanpy.collections.pbmc68k_reduced(). 
This code was run:: pbmc68k = sc.datasets.pbmc68k_reduced() pbmc68k.obs.rename(columns={"bulk_labels": "cell_type"}, inplace=True) pbmc68k.obs["cell_type"] = pbmc68k.obs["cell_type"].cat.rename_categories( {"Dendritic": "Dendritic cells", "CD14+ Monocyte": "CD14+ Monocytes"} ) del pbmc68k.obs["G2M_score"] del pbmc68k.obs["S_score"] del pbmc68k.obs["phase"] del pbmc68k.obs["n_counts"] del pbmc68k.var["dispersions"] del pbmc68k.var["dispersions_norm"] del pbmc68k.var["means"] del pbmc68k.uns["rank_genes_groups"] del pbmc68k.uns["bulk_labels_colors"] sc.pp.subsample(pbmc68k, fraction=0.1, random_state=123) pbmc68k.write("scrnaseq_pbmc68k_tiny.h5ad") """ filepath, _ = urlretrieve( "https://lamindb-dev-datasets.s3.amazonaws.com/scrnaseq_pbmc68k_tiny.h5ad" ) return ad.read_h5ad(filepath) def anndata_file_pbmc68k_test() -> Path: """Modified from scanpy.datasets.pbmc68k_reduced(). Additional slots were added for testing purposes. Returns the filepath. To reproduce:: pbmc68k = ln.examples.datasets.anndata_pbmc68k_reduced() pbmc68k_test = pbmc68k[:30, :200].copy() pbmc68k_test.raw = pbmc68k_test[:, :100] pbmc68k_test.obsp["test"] = sparse.eye(pbmc68k_test.shape[0], format="csr") pbmc68k_test.varp["test"] = sparse.eye(pbmc68k_test.shape[1], format="csr") pbmc68k_test.layers["test"] = sparse.csr_matrix(pbmc68k_test.shape) pbmc68k_test.layers["test"][0] = 1. pbmc68k_test.write("pbmc68k_test.h5ad") """ filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/pbmc68k_test.h5ad", "pbmc68k_test.h5ad" ) return Path(filepath) def anndata_pbmc3k_processed() -> ad.AnnData: """Modified from scanpy.datasets.pbmc3k_processed().""" filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/scrnaseq_scanpy_pbmc3k_processed.h5ad" ) pbmc3k = ad.read_h5ad(filepath) pbmc3k.obs.rename(columns={"louvain": "cell_type"}, inplace=True) return pbmc3k def anndata_human_immune_cells( populate_registries: bool = False, ) -> ad.AnnData: """Cross-tissue immune cell analysis reveals tissue-specific features in humans.
From: https://cellxgene.cziscience.com/collections/62ef75e4-cbea-454e-a0ce-998ec40223d3 Collection: Global To reproduce the subsample:: >>> adata = sc.read('Global.h5ad') >>> adata.obs = adata.obs[['donor_id', 'tissue', 'cell_type', 'assay', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id']].copy() >>> sc.pp.subsample(adata, fraction=0.005) >>> del adata.uns["development_cache_ontology_term_id_colors"] >>> del adata.uns["sex_ontology_term_id_colors"] >>> adata.write('human_immune.h5ad') """ filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/human_immune.h5ad") adata = ad.read_h5ad(filepath) adata.var.drop(columns=["gene_symbols", "feature_name"], inplace=True) adata.uns.pop("cell_type_ontology_term_id_colors") adata.uns.pop("title") adata.uns.pop("schema_version") adata.obs.columns = adata.obs.columns.str.replace("donor_id", "donor") columns = [col for col in adata.obs.columns if "ontology_term" not in col] adata.obs = adata.obs[columns] if populate_registries: import bionty as bt import lamindb as ln ln.save( bt.Gene.from_values( adata.var.index, field="ensembl_gene_id", organism="human" ) ) ln.save(bt.CellType.from_values(adata.obs.cell_type, field="name")) ln.save(bt.ExperimentalFactor.from_values(adata.obs.assay, field="name")) ln.save(bt.Tissue.from_values(adata.obs.tissue, field="name")) ln.Feature(name="cell_type", dtype=[bt.CellType]).save() # type: ignore ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save() # type: ignore ln.Feature(name="tissue", dtype=[bt.Tissue]).save() # type: ignore ln.Feature(name="organism", dtype=[bt.Organism]).save() # type: ignore ln.Feature(name="donor", dtype=[ln.ULabel]).save() # type: ignore bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save() ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()]) return adata def anndata_suo22_Visium10X(): """AnnData from Suo22 generated by 10x Visium.""" import anndata as ad filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/suo22_Visium10X_data_LI_subset.h5ad", "Visium10X_data_LI_subset.h5ad", ) Path("suo22/").mkdir(exist_ok=True) filepath = Path(filepath).rename("suo22/Visium10X_data_LI_subset.h5ad") return ad.read_h5ad(filepath) def mudata_papalexi21_subset(with_uns: bool = False) -> MuData: """A subsetted MuData from papalexi21. 
To reproduce the subsetting: >>> !wget https://figshare.com/ndownloader/files/36509460 >>> import mudata as md >>> import scanpy as sc >>> mdata = md.read_h5mu("36509460") >>> mdata = sc.pp.subsample(mdata, n_obs=200, copy=True)[0] >>> mdata[:, -300:].copy().write("papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu") """ import mudata as md md.set_options(pull_on_update=False) filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu", "papalexi21_subset.h5mu", ) mdata = md.read_h5mu(filepath) mdata.pull_obs() # The MuData object is malformed with duplicated information # Drop all columns for the modalities and add them again correspondingly for mod in ["rna", "adt", "hto", "gdo"]: mdata[mod].obs.drop(mdata[mod].obs.columns, axis=1, inplace=True) for col in mdata.obs.columns: for mod in ["rna", "adt", "hto", "gdo"]: if col.endswith(f"_{mod.upper()}"): new_col = col.replace(f"{mod}:", "") if new_col != col: mdata[mod].obs[new_col] = mdata.obs.pop(col) else: new_col = col.replace(f"{mod}:", "") if new_col not in mdata.obs.columns and col in mdata.obs.columns: mdata.obs[new_col] = mdata.obs.pop(col) for col in mdata.obs.columns: for mod in ["rna", "adt", "hto", "gdo"]: if col.endswith(f"_{mod.upper()}"): del mdata.obs[col] for col in [ "orig.ident", "MULTI_ID", "NT", "S.Score", "G2M.Score", "Phase", "gene_target", "guide_ID", "HTO_classification", ]: del mdata.obs[col] mdata.push_obs(["percent.mito"], mods=["rna"], drop=True) mdata["hto"].obs["technique"] = "cell hashing" mdata["hto"].obs["technique"] = mdata["hto"].obs["technique"].astype("category") mdata.pull_obs(["technique"], mods="hto") if with_uns: mdata.uns["study_metadata"] = { "temperature": 21.6, "experiment": "Experiment 1", } mdata["rna"].uns["site_metadata"] = {"pos": 99.9, "site_id": "SITE001"} return mdata def dict_cellxgene_uns() -> dict[str, Any]: """An example CELLxGENE AnnData `.uns` dictionary.""" uns = { "organism_ontology_term_id": "NCBITaxon:9606", "spatial": { "is_single": True, "library_1": { # Dynamic library_id key "images": { "fullres": "path/to/fullres.jpg", "hires": "path/to/hires.jpg", }, "scalefactors": { "spot_diameter_fullres": 89.43, "tissue_hires_scalef": 0.177, }, }, "library_2": { # Another dynamic library_id key "images": { "fullres": "path/to/fullres_2.jpg", "hires": "path/to/hires_2.jpg", }, "scalefactors": { "spot_diameter_fullres": 120.34, "tissue_hires_scalef": 0.355, }, }, }, } return uns def df_iris() -> pd.DataFrame: """The iris collection as in sklearn. 
Original code:: sklearn.collections.load_iris(as_frame=True).frame """ filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/iris.parquet") return pd.read_parquet(filepath) def df_iris_in_meter() -> pd.DataFrame: """The iris collection with lengths in meter.""" df = df_iris() # rename columns df.rename( columns={ "sepal length (cm)": "sepal_length", "sepal width (cm)": "sepal_width", "petal length (cm)": "petal_length", "petal width (cm)": "petal_width", }, inplace=True, ) df[["sepal_length", "sepal_width", "petal_length", "petal_width"]] /= 100 df["iris_organism_name"] = df["target"].map( {0: "setosa", 1: "versicolor", 2: "virginica"} ) del df["target"] return df def df_iris_in_meter_study1() -> pd.DataFrame: """The iris collection with lengths in meter.""" df_iris = df_iris_in_meter() return df_iris.iloc[: len(df_iris) // 2] def df_iris_in_meter_study2() -> pd.DataFrame: """The iris collection with lengths in meter.""" df_iris = df_iris_in_meter() return df_iris.iloc[len(df_iris) // 2 :] def dir_scrnaseq_cellranger( sample_name: str, basedir: str | Path = "./", output_only: bool = True ) -> Path: """Mock cell ranger outputs. Args: sample_name: name of the sample basedir: run directory output_only: only return output files """ basedir = Path(basedir) if not output_only: fastqdir = basedir / "fastq" fastqdir.mkdir(parents=True, exist_ok=True) fastqfile1 = fastqdir / f"{sample_name}_R1_001.fastq.gz" with open(fastqfile1, "w") as f: f.write(f"{base62(n_char=6)}") fastqfile2 = fastqdir / f"{sample_name}_R2_001.fastq.gz" fastqfile2.touch(exist_ok=True) with open(fastqfile2, "w") as f: f.write(f"{base62(n_char=6)}") sampledir = basedir / f"{sample_name}" for folder in ["raw_feature_bc_matrix", "filtered_feature_bc_matrix", "analysis"]: filedir = sampledir / folder filedir.mkdir(parents=True, exist_ok=True) for filename in [ "web_summary.html", "metrics_summary.csv", "possorted_genome_bam.bam", "possorted_genome_bam.bam.bai", "molecule_info.h5", "cloupe.cloupe", "raw_feature_bc_matrix.h5", "raw_feature_bc_matrix/barcodes.tsv.gz", "raw_feature_bc_matrix/features.tsv.gz", "raw_feature_bc_matrix/matrix.mtx.gz", "filtered_feature_bc_matrix.h5", "filtered_feature_bc_matrix/barcodes.tsv.gz", "filtered_feature_bc_matrix/features.tsv.gz", "filtered_feature_bc_matrix/matrix.mtx.gz", "analysis/analysis.csv", ]: file = sampledir / filename with open(file, "w") as f: f.write(f"{base62(n_char=6)}") return sampledir def schmidt22_crispra_gws_IFNG(basedir=".") -> Path: """CRISPRi screen collection of Schmidt22. Originally from: https://zenodo.org/record/5784651 """ filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/schmidt22-crispra-gws-IFNG.csv", "schmidt22-crispra-gws-IFNG.csv", ) return Path(filepath).rename(Path(basedir) / filepath) def schmidt22_perturbseq(basedir=".") -> Path: """Perturb-seq collection of Schmidt22. 
Subsampled and converted to h5ad from R file: https://zenodo.org/record/5784651 To reproduce the subsample: >>> adata = sc.read('HuTcellsCRISPRaPerturbSeq_Re-stimulated.h5ad') >>> adata.obs = adata.obs[['cluster_name']] >>> del adata.obsp >>> del adata.var['features'] >>> del adata.obsm['X_pca'] >>> del adata.uns >>> del adata.raw >>> del adata.varm >>> adata.obs = adata.obs.reset_index() >>> del adata.obs['index'] >>> sc.pp.subsample(adata, 0.03) >>> adata.write('schmidt22_perturbseq.h5ad') """ filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/schmidt22_perturbseq.h5ad", "schmidt22_perturbseq.h5ad", ) return Path(filepath).rename(Path(basedir) / filepath) def anndata_visium_mouse_cellxgene() -> ad.AnnData: """Visium samples of thymus from wild type B6 mice 3-6 weeks old. The dataset is a CELLxGENE schema 7.0.0 validated dataset. """ filepath, _ = urlretrieve( "https://datasets.cellxgene.cziscience.com/74f5c380-081f-41e4-9f05-346831fb67e8.h5ad", "zhang_2024_pcw56_visium.h5ad", ) return ad.read_h5ad(filepath) def spatialdata_blobs() -> SpatialData: """Example SpatialData dataset for tutorials.""" from spatialdata.datasets import blobs sdata = blobs() sdata.attrs["bio"] = { "disease": "Alzheimer disease", "developmental_stage": "adult stage", } sdata.attrs["tech"] = { "assay": "Visium Spatial Gene Expression", } sdata.attrs["random_int"] = 20 sdata.tables["table"].var.index = [ "ENSG00000139618", # BRCA2 "ENSG00000157764", # BRAF "ENSG00000999999", # Does not exist ] sdata.tables["table"].obs["sample_region"] = pd.Categorical( ["sample region 1"] * 13 + ["sample region 2"] * 13 ) return sdata ================================================ FILE: lamindb/examples/datasets/_fake.py ================================================ from __future__ import annotations def fake_bio_notebook_titles(n=100) -> list[str]: """A fake collection of study titles.""" from faker import Faker fake = Faker() from faker_biology.mol_biol import Antibody from faker_biology.physiology import CellType, Organ, Organelle fake.add_provider(CellType) fake.add_provider(Organ) fake.add_provider(Organelle) fake.add_provider(Antibody) my_words = [ "study", "investigate", "research", "result", "cluster", "rank", "candidate", "visualize", "efficiency", "classify", ] my_words += [fake.organ() for i in range(5)] + ["intestine", "intestinal"] my_words += [fake.celltype() for i in range(10)] my_words += [fake.antibody_isotype() for i in range(20)] my_notebook_titles = [fake.sentence(ext_word_list=my_words) for i in range(n)] return my_notebook_titles ================================================ FILE: lamindb/examples/datasets/_small.py ================================================ from __future__ import annotations from typing import Any, Literal import anndata as ad import numpy as np import pandas as pd def small_dataset3_cellxgene( otype: Literal["DataFrame", "AnnData"] = "AnnData", *, with_obs_defaults: bool = False, with_var_typo: bool = False, with_obs_typo: bool = False, with_uns_organism: bool = False, with_uns_spatial: bool = False, ) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData: var_id = "invalid_ensembl_id" if with_var_typo else "ENSG00000000457" var_ids = [var_id, "ENSG00000000419", "ENSG00000139618"] lung_id = "UBERON:0002048XXX" if with_obs_typo else "UBERON:0002048" obs_data = { "disease_ontology_term_id": [ "MONDO:0004975", "MONDO:0004980", "MONDO:0004980", ], "development_stage_ontology_term_id": ["unknown", "unknown", "unknown"], "sex_ontology_term_id": ["PATO:0000383", 
"PATO:0000384", "unknown"], "tissue_ontology_term_id": [lung_id, lung_id, "UBERON:0000948"], "cell_type": ["T cell", "B cell", "B cell"], "self_reported_ethnicity": ["South Asian", "South Asian", "South Asian"], "donor_id": ["-1", "1", "2"], "is_primary_data": [False, False, False], "suspension_type": ["cell", "cell", "cell"], "tissue_type": ["tissue", "tissue", "tissue"], } obs_df = pd.DataFrame( obs_data, index=["barcode1", "barcode2", "barcode3"], ) var_df = pd.DataFrame( index=var_ids, data={"feature_is_filtered": [False, False, False]} ) X = pd.DataFrame( { var_ids[0]: [2, 3, 3], var_ids[1]: [3, 4, 5], var_ids[2]: [4, 2, 3], }, index=["barcode1", "barcode2", "barcode3"], dtype="float32", ) obs_df["donor_id"] = obs_df["donor_id"].astype("category") if otype == "DataFrame": return pd.concat([X, obs_df], axis=1) else: adata = ad.AnnData(X=X, obs=obs_df, var=var_df) adata.uns["title"] = "CELLxGENE example" adata.obsm["X_pca"] = np.array( [[-1.2, 0.8], [0.5, -0.3], [0.7, -0.5]], dtype="float32" ) # CELLxGENE requires the `.raw` slot to be set - https://github.com/chanzuckerberg/single-cell-curation/issues/1304 adata.raw = adata.copy() adata.raw.var.drop(columns="feature_is_filtered", inplace=True) if with_obs_defaults: adata.obs["cell_type_ontology_term_id"] = [ "CL:0000084", "CL:0000236", "CL:0000236", ] adata.obs["self_reported_ethnicity_ontology_term_id"] = "na" adata.obs["assay_ontology_term_id"] = "EFO:1001982" adata.obs["assay"] = "single-cell RNA sequencing" if with_uns_organism: adata.uns["organism_ontology_term_id"] = "NCBITaxon:9606" adata.uns["organism"] = "Homo sapiens" else: adata.obs["organism_ontology_term_id"] = "NCBITaxon:9606" obs_data["organism"] = ["Homo sapiens", "Homo sapiens", "Homo sapiens"] if with_uns_spatial: adata.uns["spatial"] = { "is_single": True, "library_123": { "scalefactors": { "spot_diameter_fullres": 165.0, "tissue_hires_scalef": 0.5, }, "images": { "hires": np.random.default_rng().integers( 0, 255, (2000, 2000, 3), dtype=np.uint8 ) }, }, } return adata def anndata_with_obs() -> ad.AnnData: """Create a mini anndata with cell_type, disease and tissue.""" import anndata as ad import bionty.base as bionty_base celltypes = ["T cell", "hematopoietic stem cell", "hepatocyte", "my new cell type"] celltype_ids = ["CL:0000084", "CL:0000037", "CL:0000182", ""] diseases = [ "chronic kidney disease", "liver lymphoma", "cardiac ventricle disorder", "Alzheimer disease", ] tissues = ["kidney", "liver", "heart", "brain"] df = pd.DataFrame() df["cell_type"] = celltypes * 10 df["cell_type_id"] = celltype_ids * 10 df["tissue"] = tissues * 10 df["disease"] = diseases * 10 df.index = "obs" + df.index.astype(str) adata = ad.AnnData(X=np.zeros(shape=(40, 100), dtype=np.float32), obs=df) bionty_genes = bionty_base.Gene() # backwards compatible adata.var.index = ( ( bionty_genes.to_dataframe() if hasattr(bionty_genes, "to_dataframe") else bionty_genes.df() ) .head(100)["ensembl_gene_id"] .values ) return adata ================================================ FILE: lamindb/examples/datasets/define_mini_immuno_features_labels.py ================================================ import bionty as bt import lamindb as ln # define valid labels perturbation_type = ln.Record(name="Perturbation", is_type=True).save() ln.Record(name="DMSO", type=perturbation_type).save() ln.Record(name="IFNG", type=perturbation_type).save() bt.CellType.from_source(name="B cell").save() bt.CellType.from_source(name="T cell").save() # define valid features ln.Feature(name="perturbation", 
dtype=perturbation_type).save() ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save() ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save() ln.Feature(name="assay_oid", dtype=bt.ExperimentalFactor.ontology_id).save() ln.Feature(name="concentration", dtype=str).save() ln.Feature(name="treatment_time_h", dtype="num", coerce=True).save() ln.Feature(name="donor", dtype=str, nullable=True).save() ln.Feature(name="donor_ethnicity", dtype=list[bt.Ethnicity]).save() ================================================ FILE: lamindb/examples/datasets/define_mini_immuno_schema_flexible.py ================================================ import lamindb as ln schema = ln.Schema( name="Mini immuno schema", features=[ ln.Feature.get(name="perturbation"), ln.Feature.get(name="cell_type_by_model"), ln.Feature.get(name="assay_oid"), ln.Feature.get(name="donor"), ln.Feature.get(name="concentration"), ln.Feature.get(name="treatment_time_h"), ], flexible=True, # _additional_ columns in a dataframe are validated & annotated ).save() ================================================ FILE: lamindb/examples/datasets/mini_immuno.py ================================================ """Two "mini immuno" datasets. Datasets -------- .. autofunction:: get_dataset1 .. autofunction:: get_dataset2 Schemas ------- .. autofunction:: define_features_labels .. autofunction:: define_mini_immuno_schema_flexible Utilities --------- .. autofunction:: save_mini_immuno_datasets """ from __future__ import annotations from datetime import date from typing import TYPE_CHECKING, Literal import anndata as ad import pandas as pd if TYPE_CHECKING: from lamindb.models import Schema def define_features_labels() -> None: """Features & labels to validate the mini immuno datasets. .. literalinclude:: scripts/define_mini_immuno_features_labels.py :language: python """ from . import define_mini_immuno_features_labels # noqa def define_mini_immuno_schema_flexible() -> Schema: """Features & labels to validate the mini immuno datasets. .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py :language: python """ from lamindb.models import Schema define_features_labels() from . import define_mini_immuno_schema_flexible # noqa return Schema.get(name="Mini immuno schema") def save_mini_immuno_datasets(): """Save the two "mini immuno" datasets. .. literalinclude:: scripts/save_mini_immuno_datasets.py :language: python """ from . 
import save_mini_immuno_datasets # noqa def get_dataset1( otype: Literal["DataFrame", "AnnData"] = "DataFrame", gene_symbols_in_index: bool = False, with_typo: bool = False, with_cell_type_synonym: bool = False, with_cell_type_typo: bool = False, with_gene_typo: bool = False, with_outdated_gene: bool = False, with_wrong_subtype: bool = False, with_index_type_mismatch: bool = False, with_date_as_iso_string: bool = True, ) -> pd.DataFrame | ad.AnnData: """A small tabular dataset measuring expression & metadata.""" # define the data in the dataset # it's a mix of numerical measurements and observation-level metadata ifng = "IFNJ" if with_typo else "IFNG" thing = "ulabel_but_not_perturbation" if with_wrong_subtype else "DMSO" if gene_symbols_in_index: var_ids = ["CD8A", "CD4", "CD14" if not with_gene_typo else "GeneTypo"] else: var_ids = [ "ENSG00000153563", "ENSG00000010610", "ENSG00000170458" if not with_gene_typo else "GeneTypo" if not with_outdated_gene else "ENSG00000278198", ] abt_cell = ( "CD8-pos alpha-beta T cell" if with_cell_type_typo else "CD8-positive, alpha-beta T cell" ) dataset_dict = { var_ids[0]: [1, 2, 3], var_ids[1]: [3, 4, 5], var_ids[2]: [5, 6, 7], "perturbation": pd.Categorical(["DMSO", ifng, thing]), "sample_note": ["was ok", "looks naah", "pretty! 🤩"], "cell_type_by_expert": pd.Categorical( ["B-cell" if with_cell_type_synonym else "B cell", abt_cell, abt_cell] ), "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]), "assay_oid": pd.Categorical(["EFO:0008913", "EFO:0008913", "EFO:0008913"]), "concentration": ["0.1%", "200 nM", "0.1%"], "treatment_time_h": [24, 24, 6], "donor": ["D0001", "D0002", None], "donor_ethnicity": [ ["Chinese", "Singaporean Chinese"], ["Chinese", "Han Chinese"], ["Chinese"], ], } # define the dataset-level metadata metadata = { "temperature": 21.6, "experiment": "Experiment 1", "date_of_study": "2024-12-01" if with_date_as_iso_string else date(2024, 12, 1), "study_note": "We had a great time performing this study and the results look compelling.", } # the dataset as DataFrame dataset_df = pd.DataFrame( dataset_dict, index=["sample1", "sample2", 0] # type: ignore if with_index_type_mismatch else ["sample1", "sample2", "sample3"], ) if otype == "DataFrame": for key, value in metadata.items(): dataset_df.attrs[key] = value return dataset_df else: del dataset_df[ "donor_ethnicity" ] # remove the donor_ethnicity because AnnData save will error dataset_ad = ad.AnnData( dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata ) return dataset_ad def get_dataset2( otype: Literal["DataFrame", "AnnData"] = "DataFrame", gene_symbols_in_index: bool = False, with_date_as_iso_string: bool = True, ) -> pd.DataFrame | ad.AnnData: """A second small tabular dataset measuring expression & metadata.""" if gene_symbols_in_index: var_ids = ["CD8A", "CD4", "CD38"] else: var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"] dataset_dict = { var_ids[0]: [2, 3, 3], var_ids[1]: [3, 4, 5], var_ids[2]: [4, 2, 3], "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]), "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]), "concentration": ["0.1%", "200 nM", "0.1%"], "treatment_time_h": [24, 24, 6], "donor": ["D0003", "D0003", "D0004"], } metadata = { "temperature": 22.6, "experiment": "Experiment 2", "date_of_study": "2025-02-13" if with_date_as_iso_string else date(2025, 2, 13), } dataset_df = pd.DataFrame( dataset_dict, index=["sample4", "sample5", "sample6"], ) ad.AnnData( dataset_df[var_ids], 
obs=dataset_df[["perturbation", "cell_type_by_model"]], ) if otype == "DataFrame": for key, value in metadata.items(): dataset_df.attrs[key] = value return dataset_df else: dataset_ad = ad.AnnData( dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata ) return dataset_ad ================================================ FILE: lamindb/examples/datasets/save_mini_immuno_datasets.py ================================================ from datetime import date import bionty as bt import lamindb as ln ## define valid labels ln.Record.from_values(["DMSO", "IFNG"], create=True).save() ln.Record.from_values(["Experiment 1", "Experiment 2"], create=True).save() bt.CellType.from_values(["B cell", "T cell"]).save() # observation-level metadata ln.Feature(name="perturbation", dtype=ln.Record).save() ln.Feature(name="sample_note", dtype=str).save() ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save() ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save() # dataset-level metadata ln.Feature(name="temperature", dtype=float).save() ln.Feature(name="experiment", dtype=ln.Record).save() ln.Feature(name="date_of_study", dtype=date, coerce=True).save() ln.Feature(name="study_note", dtype=str).save() ln.Feature(name="study_metadata", dtype=dict).save() schema = ln.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs() ## Ingest dataset1 adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") artifact = ln.Artifact.from_anndata( adata, key="examples/dataset1.h5ad", schema=schema, ).save() adhoc = {"study_metadata": {"detail1": "123", "detail2": 1}} dataset_metadata = adata.uns dataset_metadata.update(adhoc) artifact.features.add_values(dataset_metadata) # type: ignore # Ingest dataset2 adata2 = ln.examples.datasets.mini_immuno.get_dataset2(otype="AnnData") artifact2 = ln.Artifact.from_anndata( adata2, key="examples/dataset2.h5ad", schema=schema, ).save() adhoc2 = {"study_metadata": {"detail1": "456", "detail2": 2}} dataset_metadata2 = adata2.uns dataset_metadata2.update(adhoc2) artifact2.features.add_values(dataset_metadata2) # type: ignore ================================================ FILE: lamindb/examples/fixtures/__init__.py ================================================ ================================================ FILE: lamindb/examples/fixtures/sheets.py ================================================ import bionty as bt import pandas as pd import pytest import lamindb as ln @pytest.fixture(scope="module") def populate_sheets_compound_treatment(): # Compounds --------------------------- compound_type = ln.Record(name="Compound", is_type=True).save() # features for compounds structure = ln.Feature(name="structure", dtype="str").save() # drug1 drug1 = ln.Record(name="drug1", type=compound_type).save() ln.models.RecordJson(record=drug1, feature=structure, value="12345").save() # drug2 drug2 = ln.Record(name="drug2", type=compound_type).save() ln.models.RecordJson(record=drug2, feature=structure, value="45678").save() # Treatments --------------------------- treatment_type = ln.Record(name="Treatment", is_type=True).save() # features for treatments compound = ln.Feature(name="compound", dtype=compound_type).save() concentration = ln.Feature(name="concentration", dtype="num").save() # a sheet for treatments treatments_sheet = ln.Record( name="My treatments 2025-05", type=treatment_type, is_type=True ).save() # sheet without validating schema # populate treatment1 treatment1 = ln.Record(name="treatment1", type=treatments_sheet).save() 
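# link treatment1 to drug1 through the "compound" feature (a record-to-record link) and store the scalar concentration as a JSON value; the asserts check that the record link is traversable from both sides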
ln.models.RecordRecord(record=treatment1, feature=compound, value=drug1).save() assert drug1 in treatment1.linked_records.all() assert treatment1 in drug1.linked_in_records.all() ln.models.RecordJson(record=treatment1, feature=concentration, value="2nM").save() # populate treatment2 treatment2 = ln.Record(name="treatment2", type=treatments_sheet).save() ln.models.RecordRecord(record=treatment2, feature=compound, value=drug2).save() ln.models.RecordJson(record=treatment2, feature=concentration, value="4nM").save() # Samples --------------------------- # features named id, uid or name conflict with django field names, we test them here id_feature = ln.Feature(name="id", dtype=int).save() uid_feature = ln.Feature(name="uid", dtype=str).save() name_feature = ln.Feature(name="name", dtype=str).save() project = ln.Feature(name="project", dtype=ln.Project).save() project1 = ln.Project(name="Project 1").save() sample_type = ln.Record(name="BioSample", is_type=True).save() treatment = ln.Feature(name="treatment", dtype=treatment_type).save() cell_line = ln.Feature(name="cell_line", dtype=bt.CellLine).save() preparation_date = ln.Feature(name="preparation_date", dtype="datetime").save() cell_line._dtype_str = ( "cat[bionty.CellLine]" # might have previously been set to "cat" ) cell_line.save() sample_schema1 = ln.Schema( name="My samples schema 2025-06", features=[ id_feature, uid_feature, name_feature, treatment, cell_line, preparation_date, project, ], ).save() sample_sheet1 = ln.Record( name="My samples 2025-06", schema=sample_schema1, type=sample_type ).save() # values for cell lines hek293t = bt.CellLine.from_source("HEK293T").save() # populate sample1 sample1 = ln.Record(name="sample1", type=sample_sheet1).save() ln.models.RecordJson(record=sample1, feature=id_feature, value=1).save() ln.models.RecordJson(record=sample1, feature=uid_feature, value="S1").save() ln.models.RecordJson(record=sample1, feature=name_feature, value="Sample 1").save() ln.models.RecordRecord(record=sample1, feature=treatment, value=treatment1).save() bt.models.RecordCellLine(record=sample1, feature=cell_line, value=hek293t).save() ln.models.RecordJson( record=sample1, feature=preparation_date, value="2025-06-01T05:00:00" ).save() ln.models.RecordProject(record=sample1, feature=project, value=project1).save() # populate sample2 sample2 = ln.Record(name="sample2", type=sample_sheet1).save() ln.models.RecordJson(record=sample2, feature=id_feature, value=2).save() ln.models.RecordJson(record=sample2, feature=uid_feature, value="S2").save() ln.models.RecordJson(record=sample2, feature=name_feature, value="Sample 2").save() ln.models.RecordRecord(record=sample2, feature=treatment, value=treatment2).save() bt.models.RecordCellLine(record=sample2, feature=cell_line, value=hek293t).save() ln.models.RecordJson( record=sample2, feature=preparation_date, value="2025-06-01T06:00:00" ).save() ln.models.RecordProject(record=sample2, feature=project, value=project1).save() # another sheet for samples sample_note = ln.Feature(name="sample_note", dtype="str").save() sample_schema2 = ln.Schema( name="My samples schema 2025-07", features=[treatment, cell_line, sample_note, project], ).save() # the sheet sample_sheet2 = ln.Record( name="My samples 2025-07", schema=sample_schema2, type=sample_type ).save() # populate sample3 sample3 = ln.Record(type=sample_sheet2).save() # no name ln.models.RecordRecord(record=sample3, feature=treatment, value=treatment1).save() bt.models.RecordCellLine(record=sample3, feature=cell_line, 
value=hek293t).save() ln.models.RecordJson( record=sample3, feature=preparation_date, value="2025-06-02T05:00:00Z" ).save() ln.models.RecordProject(record=sample3, feature=project, value=project1).save() # populate sample4 sample4 = ln.Record(type=sample_sheet2).save() ln.models.RecordRecord(record=sample4, feature=treatment, value=treatment2).save() bt.models.RecordCellLine(record=sample4, feature=cell_line, value=hek293t).save() ln.models.RecordJson( record=sample4, feature=preparation_date, value="2025-06-02T06:00:00Z" ).save() ln.models.RecordProject(record=sample4, feature=project, value=project1).save() yield treatments_sheet, sample_sheet1 sample4.delete(permanent=True) sample3.delete(permanent=True) sample_sheet2.delete(permanent=True) sample_schema2.delete(permanent=True) sample_note.delete(permanent=True) sample2.delete(permanent=True) sample1.delete(permanent=True) # hek293t.delete(permanent=True) # not for now sample_sheet1.delete(permanent=True) sample_schema1.delete(permanent=True) preparation_date.delete(permanent=True) cell_line.delete(permanent=True) # sample_type.delete(permanent=True) # not for now treatment2.delete(permanent=True) treatment1.delete(permanent=True) treatments_sheet.delete(permanent=True) treatment_type.delete(permanent=True) concentration.delete(permanent=True) drug2.delete(permanent=True) drug1.delete(permanent=True) structure.delete(permanent=True) compound.delete(permanent=True) compound_type.delete(permanent=True) @pytest.fixture(scope="module") def populate_nextflow_sheet_with_samples(): # Biosample schema and type samples_schema = ln.Schema( name="Biosample test schema", features=[ ln.Feature(name="species", dtype="cat[bionty.Organism]").save(), ln.Feature(name="cell_type", dtype="cat[bionty.CellType]").save(), ln.Feature(name="tissue", dtype="cat[bionty.Tissue]").save(), ], ).save() biosample_type = ln.Record(name="BioSample", is_type=True).save() # Biosamples sheet samples_sheet = ln.Record( name="My samples 2025-04", schema=samples_schema, type=biosample_type ).save() sample_x = ln.Record(name="Sample_X", type=samples_sheet).save() sample_y = ln.Record(name="Sample_Y", type=samples_sheet).save() organism_human = bt.Organism.from_source(name="human").save() celltype_tcell = bt.CellType.from_source(name="T cell").save() tissue_blood = bt.Tissue.from_source(name="blood").save() features = ln.Feature.lookup() for sample in [sample_x, sample_y]: bt.models.RecordOrganism( record=sample, feature=features.species, value=organism_human ).save() bt.models.RecordCellType( record=sample, feature=features.cell_type, value=celltype_tcell ).save() bt.models.RecordTissue( record=sample, feature=features.tissue, value=tissue_blood ).save() # Nextflow samplesheet schema nextflow_schema = ln.Schema( name="RNA-seq standard", features=[ ln.Feature(name="sample", dtype=biosample_type).save(), ln.Feature(name="fastq_1", dtype=str).save(), ln.Feature(name="fastq_2", dtype=str).save(), ln.Feature(name="expected_cells", dtype=int).save(), ln.Feature(name="seq_center", dtype=str).save().with_config(optional=True), ], ordered_set=True, ).save() nextflowsample_type = ln.Record(name="NextflowSample", is_type=True).save() nextflow_sheet = ln.Record( schema=nextflow_schema, name="RNA-seq nextflow samplesheet 001", type=nextflowsample_type, is_type=True, ).save() sample_data = { "sample": ["Sample_X", "Sample_Y", "Sample_Y"], "fastq_1": [ "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R1_001.fastq.gz", 
"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R1_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R1_001.fastq.gz", ], "fastq_2": [ "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R2_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R2_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R2_001.fastq.gz", ], "expected_cells": [5000, 5000, 5000], } df = pd.DataFrame(sample_data) features = ln.Feature.lookup() nextflow_samples = [] for _, row in df.iterrows(): sample = ln.Record(type=nextflow_sheet).save() nextflow_samples.append(sample) ln.models.RecordRecord( record=sample, feature=features.sample, value=ln.Record.get(name=row["sample"]), ).save() ln.models.RecordJson( record=sample, feature=features.fastq_1, value=row["fastq_1"] ).save() ln.models.RecordJson( record=sample, feature=features.fastq_2, value=row["fastq_2"] ).save() ln.models.RecordJson( record=sample, feature=features.expected_cells, value=row["expected_cells"] ).save() yield nextflow_sheet # Delete in reverse order of creation # Delete nextflow samples for sample in reversed(nextflow_samples): sample.delete(permanent=True) # Delete nextflow sheet and schema nextflow_sheet.delete(permanent=True) nextflowsample_type.delete(permanent=True) nextflow_schema.delete(permanent=True) # Delete samples sheet and schema samples_sheet.records.all().delete(permanent=True) samples_sheet.delete(permanent=True) # biosample_type.delete(permanent=True) # not for now (shared with first fixture) samples_schema.delete(permanent=True) print(ln.Schema.to_dataframe()) # Delete nextflow schema features features = ln.Feature.lookup() features.seq_center.delete(permanent=True) features.expected_cells.delete(permanent=True) features.fastq_2.delete(permanent=True) features.fastq_1.delete(permanent=True) features.sample.delete(permanent=True) # Delete biosamples sample_y.delete(permanent=True) sample_x.delete(permanent=True) # Delete biosample schema features features.tissue.delete(permanent=True) features.cell_type.delete(permanent=True) features.species.delete(permanent=True) # Note: organism_human, celltype_tcell, tissue_blood are from bionty # and might be shared, so not deleting them (similar to hek293t in first fixture) ================================================ FILE: lamindb/examples/mlflow/__init__.py ================================================ """Examples and utilities for Mlflow. .. autofunction:: save_mlflow_features """ import lamindb as ln def save_mlflow_features(): """Saves all MLflow experiment and run related features. 
Saves the following features: - mlflow_run_id - mlflow_run_name - mlflow_experiment_id - mlflow_experiment_name - mlflow_user_id - mlflow_status - mlflow_lifecycle_stage - mlflow_artifact_uri - mlflow_start_time - mlflow_end_time """ mlflow_type = ln.Feature(name="MLflow", is_type=True).save() ln.Feature(name="mlflow_run_id", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_run_name", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_experiment_id", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_experiment_name", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_user_id", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_status", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_lifecycle_stage", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_artifact_uri", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_start_time", dtype=int, type=mlflow_type).save() ln.Feature(name="mlflow_end_time", dtype=int, type=mlflow_type).save() ================================================ FILE: lamindb/examples/schemas/__init__.py ================================================ """Example schemas. .. autofunction:: valid_features .. autofunction:: anndata_ensembl_gene_ids_and_valid_features_in_obs """ from ._anndata import anndata_ensembl_gene_ids_and_valid_features_in_obs from ._simple import valid_features ================================================ FILE: lamindb/examples/schemas/_anndata.py ================================================ from __future__ import annotations import importlib from typing import TYPE_CHECKING if TYPE_CHECKING: from ... import Schema def anndata_ensembl_gene_ids_and_valid_features_in_obs() -> Schema: """An `AnnData` schema validating Ensembl gene IDs and valid features in obs. .. literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py :language: python """ from ... import Schema try: return Schema.get(name="anndata_ensembl_gene_ids_and_valid_features_in_obs") except Schema.DoesNotExist: from . import define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs # noqa try: return Schema.get(name="anndata_ensembl_gene_ids_and_valid_features_in_obs") except Schema.DoesNotExist: importlib.reload( define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs ) return Schema.get(name="anndata_ensembl_gene_ids_and_valid_features_in_obs") ================================================ FILE: lamindb/examples/schemas/_simple.py ================================================ from __future__ import annotations import importlib from typing import TYPE_CHECKING if TYPE_CHECKING: from ... import Schema def valid_features() -> Schema: """A `DataFrame` schema that validates that columns map on existing features. .. literalinclude:: scripts/define_valid_features.py :language: python """ from ... import Schema try: return Schema.get(name="valid_features") except Schema.DoesNotExist: try: from . 
import define_valid_features # noqa return Schema.get(name="valid_features") except Schema.DoesNotExist: importlib.reload(define_valid_features) return Schema.get(name="valid_features") ================================================ FILE: lamindb/examples/schemas/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py ================================================ import bionty as bt import lamindb as ln obs_schema = ln.examples.schemas.valid_features() varT_schema = ln.Schema( name="valid_ensembl_gene_ids", itype=bt.Gene.ensembl_gene_id ).save() schema = ln.Schema( name="anndata_ensembl_gene_ids_and_valid_features_in_obs", otype="AnnData", slots={"obs": obs_schema, "var.T": varT_schema}, ).save() ================================================ FILE: lamindb/examples/schemas/define_valid_features.py ================================================ import lamindb as ln schema = ln.Schema(name="valid_features", itype=ln.Feature).save() ================================================ FILE: lamindb/examples/wandb/__init__.py ================================================ """Examples and utilities for Weights & Biases. .. autofunction:: save_wandb_features """ import lamindb as ln def save_wandb_features(): """Saves all Weights & Biases project and run related features. Saves the following features: - wandb_run_id - wandb_run_name - wandb_run_entity - wandb_project - wandb_state - wandb_url - wandb_tags - wandb_group - wandb_job_type - timestamp - runtime """ wandb_type = ln.Feature(name="Weights & Biases", is_type=True).save() ln.Feature(name="wandb_run_id", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_run_name", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_run_entity", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_project", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_state", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_url", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_tags", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_group", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_job_type", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_timestamp", dtype=float, type=wandb_type).save() ln.Feature(name="wandb_runtime", dtype=float, type=wandb_type).save() ================================================ FILE: lamindb/integrations/__init__.py ================================================ """Integrations. Modules ------- .. autosummary:: :toctree: . lightning Functions --------- .. autofunction:: save_vitessce_config .. autofunction:: save_tiledbsoma_experiment .. 
autofunction:: curate_from_croissant """ from ._croissant import curate_from_croissant from ._vitessce import save_vitessce_config __all__ = [ "lightning", "save_tiledbsoma_experiment", "curate_from_croissant", "save_vitessce_config", ] def __getattr__(name: str): """Lazy-import save_tiledbsoma_experiment to avoid loading storage at package import.""" if name == "save_tiledbsoma_experiment": from lamindb.core.storage import save_tiledbsoma_experiment return save_tiledbsoma_experiment raise AttributeError(f"module {__name__!r} has no attribute {name!r}") ================================================ FILE: lamindb/integrations/_croissant.py ================================================ from __future__ import annotations import json from pathlib import Path from typing import TYPE_CHECKING, Any import lamindb_setup as ln_setup from lamin_utils import logger from lamindb_setup.core.upath import UPath if TYPE_CHECKING: from lamindb_setup.types import AnyPathStr import lamindb as ln def curate_from_croissant( croissant_data: AnyPathStr | dict[str, Any], run: ln.Run | None = None, ) -> ln.Artifact | ln.Collection: """Create annotated artifacts from a CroissantML file. Returns a collection if multiple files are found in `croissant_data`, otherwise a single artifact. Args: croissant_data: Path to CroissantML JSON file or dictionary. Example: :: artifact = ln.integrations.curate_from_croissant("dataset_metadata.json") """ import lamindb as ln from ..models.artifact import check_path_in_existing_storage # Load CroissantML data if isinstance(croissant_data, (str, Path, UPath)): croissant_path = UPath(croissant_data) if not croissant_path.exists(): raise FileNotFoundError(f"File not found: {croissant_data}") with croissant_path.open(encoding="utf-8") as f: data = json.load(f) elif isinstance(croissant_data, dict): data = croissant_data else: raise ValueError( "croissant_data must be a file path, JSON string, or dictionary" ) # Validate basic structure if data.get("@type") != "Dataset": raise ValueError("CroissantML @type must be 'Dataset'") if "name" not in data: raise ValueError("CroissantML must have a 'name' field") # Extract basic metadata dataset_name = data["name"] description = data.get("description", None) version = data.get("version", None) license_info = data.get("license", None) project_name = data.get("cr:projectName", None) # Create license feature and label if license info exists license_label = None if license_info: license_label_type = ln.ULabel.filter(name="License", is_type=True).first() if not license_label_type: license_label_type = ln.ULabel(name="License", is_type=True).save() license_label = ln.ULabel.filter(name=license_info).first() if not license_label: license_label = ln.ULabel( name=license_info, description="Dataset license", type=license_label_type, ).save() project_label = None if project_name: project_label = ln.Project.filter(name=project_name).first() if not project_label: project_label = ln.Project(name=project_name).save() # Extract file distributions artifacts = [] file_distributions = data.get("distribution", []) if not file_distributions: raise ValueError("No file distributions found in croissant data") for dist in file_distributions: file_id = dist.get("@id", "") if UPath(file_id).exists(): file_path = file_id else: content_url = dist.get("contentUrl", "") file_path = content_url or data.get("url", "") if not file_path: raise ValueError(f"No file path found in croissant distribution: {dist}") if not UPath(file_path).exists(): raise ValueError(f"Inferred 
file path does not exist: {file_path}") result = check_path_in_existing_storage( file_path, check_hub_register_storage=ln_setup.settings.instance.is_on_hub ) if isinstance(result, ln.Storage): key = None # will automatically use existing storage key else: current_storage_location = ( ln.settings.storage if not ln.setup.settings.instance.keep_artifacts_local else ln.settings.local_storage ) logger.warning( f"file path {file_path} is not part of a known storage location, will be duplicated to: {current_storage_location}" ) key = file_id if len(file_distributions) == 1: # it doesn't make sense to have the dataset name on the individual # artifact if it's part of a collection artifact_description = dataset_name if description is not None: artifact_description += f" - {description}" else: artifact_description = None artifact = ln.Artifact( # type: ignore file_path, key=key, description=artifact_description, version=version, kind="dataset", run=run, ).save() if license_label: artifact.ulabels.add(license_label) if project_label: artifact.projects.add(project_label) artifacts.append(artifact) if len(artifacts) == 1: return artifacts[0] else: collection = ln.Collection( # type: ignore artifacts, key=dataset_name, description=description, version=version ).save() if license_label: collection.ulabels.add(license_label) if project_label: collection.projects.add(project_label) return collection ================================================ FILE: lamindb/integrations/_vitessce.py ================================================ from __future__ import annotations import json from datetime import datetime, timezone from typing import TYPE_CHECKING import lamindb_setup as ln_setup from lamin_utils import logger from lamindb.models.artifact import Artifact from lamindb.models.collection import Collection from lamindb.models.run import Run from lamindb.models.transform import Transform if TYPE_CHECKING: from vitessce import VitessceConfig # "unit test": https://github.com/laminlabs/lamindb/blob/main/docs/storage/vitessce.ipynb # integration test & context: https://github.com/laminlabs/lamin-spatial/blob/main/docs/vitessce.ipynb def save_vitessce_config( vitessce_config: VitessceConfig, key: str | None = None, description: str | None = None, ) -> Artifact: """Validates and saves a `VitessceConfig` object. If the `VitessceConfig` object references multiple artifacts, automatically creates a `Collection` and displays the "Vitessce button" next to it. The `VitessceConfig` artifact has `.suffix = ".vitessce.json"` and `.kind = "__lamindb_config__"`, which is by default hidden on the hub UI. Guide: :doc:`docs:vitessce`. Args: vitessce_config: A `VitessceConfig` object. key: A `key` for the `VitessceConfig` artifact. description: A `description` for the `VitessceConfig` artifact. Is additionally used as `key` for a `Collection` in case the `VitessceConfig` object references multiple artifacts.
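Example: a minimal sketch; building the full `VitessceConfig` (which must reference at least one lamindb-backed artifact) is abbreviated, and `schema_version`, `name`, and `key` are placeholders::

    import lamindb as ln
    from vitessce import VitessceConfig

    vc = VitessceConfig(schema_version="1.0.15", name="My dataset view")
    # ... add datasets whose files are backed by lamindb artifacts ...
    config_artifact = ln.integrations.save_vitessce_config(vc, key="my_view.vitessce.json")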
""" # can only import here because vitessce is not a dependency from vitessce import VitessceConfig assert isinstance(vitessce_config, VitessceConfig) # noqa: S101 vc_dict = vitessce_config.to_dict() try: url_to_artifact_dict = vitessce_config.get_artifacts() except AttributeError as e: raise SystemExit( "save_vitessce_config() requires vitessce>=3.4.0: pip install vitessce>=3.4.0" ) from e dataset_artifacts = list(url_to_artifact_dict.values()) message = "\n".join([artifact.__repr__() for artifact in dataset_artifacts]) logger.important(f"VitessceConfig references these artifacts:\n{message}") assert len(dataset_artifacts) > 0 # noqa: S101 # the below will be replaced with a `ln.step()` decorator soon transform = Transform( # type: ignore uid="kup03MJBsIVa0002", key="save_vitessce_config", type="function", version="3", ).save() run = Run(transform=transform).save() run.input_artifacts.set(dataset_artifacts) collection = None if len(dataset_artifacts) > 1: # if we have more datasets, we should create a collection # and attach an action to the collection # consicious use of description for key, see here # https://github.com/laminlabs/lamindb/pull/2997 collection = Collection(dataset_artifacts, key=description).save() # create a JSON export config_file_local_path = ln_setup.settings.cache_dir / "config.vitessce.json" with open(config_file_local_path, "w") as file: json.dump(vc_dict, file) vitessce_config_artifact = Artifact( config_file_local_path, key=key, description=description, run=run, kind="__lamindb_config__", ).save() slug = ln_setup.settings.instance.slug logger.important( f"VitessceConfig: https://lamin.ai/{slug}/artifact/{vitessce_config_artifact.uid}" ) if collection is None: # we have one and only one dataset artifact, hence the following line is OK dataset_artifacts[0]._actions.add(vitessce_config_artifact) logger.important( f"Dataset: https://lamin.ai/{slug}/artifact/{dataset_artifacts[0].uid}" ) else: collection._actions.add(vitessce_config_artifact) logger.important( f"Collection: https://lamin.ai/{slug}/collection/{collection.uid}" ) run.finished_at = datetime.now(timezone.utc) run.save() return vitessce_config_artifact ================================================ FILE: lamindb/integrations/lightning.py ================================================ """PyTorch Lightning integration for LaminDB. The public API has two layers: - :class:`Checkpoint` is the concrete LaminDB implementation that persists checkpoint, config, and `hparams.yaml` files as :class:`~lamindb.Artifact` objects and annotates them with :class:`~lamindb.Feature` objects. - :class:`ArtifactPublishingModelCheckpoint` is the generic extension layer adding checkpoint artifact lifecycle hooks without implementing Lamin persistence details yet. External integrations can either subclass :class:`Checkpoint` directly or attach an :class:`ArtifactObserver` to react to saved and removed artifacts. Here is a guide: :doc:`lightning`. Main API -------- .. autoclass:: Checkpoint .. autofunction:: save_lightning_features Auxiliary classes ----------------- .. autoclass:: ArtifactPublishingModelCheckpoint .. autoclass:: SaveConfigCallback .. autoclass:: ArtifactSavedEvent .. 
autoclass:: ArtifactRemovedEvent """ from __future__ import annotations import warnings from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Any, Final, Literal, Protocol import lightning.pytorch as pl from lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint from lightning.pytorch.cli import SaveConfigCallback as _SaveConfigCallback import lamindb as ln from lamindb.models.artifact import track_run_input if TYPE_CHECKING: from datetime import timedelta from lightning.fabric.utilities.types import _PATH _RUN_AUTO_FEATURES: Final = frozenset( { "logger_name", "logger_version", "max_epochs", "max_steps", "precision", "accumulate_grad_batches", "gradient_clip_val", "monitor", "mode", } ) _ARTIFACT_AUTO_FEATURES: Final = frozenset( { "is_best_model", "is_last_model", "score", "model_rank", "save_weights_only", "monitor", "mode", } ) _SUPPORTED_AUTO_FEATURES: Final = _RUN_AUTO_FEATURES | _ARTIFACT_AUTO_FEATURES ArtifactKind = Literal["checkpoint", "config", "hparams"] @dataclass(frozen=True) class ArtifactEvent: """Common metadata emitted when a checkpoint-related artifact changes. The event records the logical artifact key, the local path Lightning wrote, and the trainer that triggered the lifecycle event. """ kind: ArtifactKind key: str local_path: Path trainer: pl.Trainer @dataclass(frozen=True) class ArtifactSavedEvent(ArtifactEvent): """Metadata emitted after a checkpoint-related artifact has been persisted. `artifact` is intentionally typed generically so downstream integrations can expose their own persisted object while still using the common lifecycle API. `storage_uri` is the stable hand-off value for registries such as ClearML. """ artifact: Any storage_uri: str @dataclass(frozen=True) class ArtifactRemovedEvent(ArtifactEvent): """Metadata emitted after a local checkpoint file has been removed. Removal currently applies to checkpoint files. Config and hparams artifacts are save-only in the current Lightning integration. """ artifact: Any | None = None storage_uri: str | None = None class ArtifactObserver(Protocol): """Observer notified about checkpoint artifact lifecycle events. This is the preferred composition hook for downstream integrations that need to register checkpoints elsewhere after Lamin persistence completes. """ def on_artifact_saved(self, event: ArtifactSavedEvent) -> None: ... def on_artifact_removed(self, event: ArtifactRemovedEvent) -> None: ... class ArtifactPublisher(Protocol): """Persistence backend for checkpoint-related artifacts. :class:`ArtifactPublishingModelCheckpoint` manages the artifact lifecycle, while publishers encapsulate backend-specific save behavior and storage URI resolution. """ def create_artifact( self, local_path: Path | str, *, key: str, description: str, kind: str | None = None, add_as_input_to_run: bool = False, skip_hash_lookup: bool = False, ) -> Any: ... def storage_uri(self, artifact: Any) -> str: ... class LaminArtifactPublisher: """Persist checkpoint-related artifacts into LaminDB. This service is intentionally separate from :class:`Checkpoint` so that the checkpoint callback can focus on Lightning behavior and feature handling while persistence details remain replaceable. 
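A minimal sketch (the checkpoint path, key, and description are illustrative placeholders)::

    publisher = LaminArtifactPublisher()
    artifact = publisher.create_artifact(
        "checkpoints/epoch=2-step=300.ckpt",
        key="models/epoch=2-step=300.ckpt",
        description="checkpoint written by the Lightning integration",
    )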
""" def create_artifact( self, local_path: Path | str, *, key: str, description: str, kind: str | None = None, add_as_input_to_run: bool = False, skip_hash_lookup: bool = False, ) -> ln.Artifact: artifact_kwargs: dict[str, Any] = {"key": key, "description": description} if kind is not None: artifact_kwargs["kind"] = kind if add_as_input_to_run: artifact_kwargs["run"] = False if skip_hash_lookup: artifact_kwargs["skip_hash_lookup"] = True artifact = ln.Artifact(local_path, **artifact_kwargs) artifact.save() if add_as_input_to_run: track_run_input(artifact, is_run_input=True) return artifact def storage_uri(self, artifact: ln.Artifact) -> str: return str(artifact.path) def save_lightning_features() -> None: """Save features to auto-track lightning parameters & metrics. Creates the following features under the `lamindb.lightning` feature type if they do not already exist: Artifact-level features: - `is_best_model` (bool): Whether this checkpoint is the best model. - `is_last_model` (bool): Whether this checkpoint is the most recently saved model. - `score` (float): The monitored metric score. - `model_rank` (int): Rank among all checkpoints (0 = best). - `save_weights_only` (bool): Whether this checkpoint only stores model weights. - `monitor` (str): Metric name this checkpoint uses for comparison. - `mode` (str): Optimization mode (`min` or `max`) used for checkpoint ranking. Run-level features: - `logger_name` (str): Name from the first Lightning logger. - `logger_version` (str): Version from the first Lightning logger. - `max_epochs` (int): Maximum number of epochs. - `max_steps` (int): Maximum number of training steps. - `precision` (str): Training precision (e.g., "32", "16-mixed", "bf16"). - `accumulate_grad_batches` (int): Number of batches to accumulate gradients over. - `gradient_clip_val` (float): Gradient clipping value. - `monitor` (str): Metric name being monitored. - `mode` (str): Optimization mode ("min" or "max"). Args: None. 
Example: Save the features to the database:: from lamindb.integrations import lightning as ll ll.save_lightning_features() """ # normal matching fails because of non-matching dtype (__lamindb_lightning__ vs None) if ( lightning_feature_type := ln.Feature.filter( name="lamindb.lightning" ).one_or_none() ) is None: lightning_feature_type = ln.Feature( # type: ignore[call-overload] name="lamindb.lightning", description="Auto-generated features tracking lightning parameters & metrics", is_type=True, ) lightning_feature_type._dtype_str = "__lamindb_lightning__" lightning_feature_type.save() ln.Feature(name="is_best_model", dtype=bool, type=lightning_feature_type).save() ln.Feature(name="is_last_model", dtype=bool, type=lightning_feature_type).save() ln.Feature(name="score", dtype=float, type=lightning_feature_type).save() ln.Feature(name="model_rank", dtype=int, type=lightning_feature_type).save() ln.Feature(name="logger_name", dtype=str, type=lightning_feature_type).save() ln.Feature(name="logger_version", dtype=str, type=lightning_feature_type).save() ln.Feature(name="max_epochs", dtype=int, type=lightning_feature_type).save() ln.Feature(name="max_steps", dtype=int, type=lightning_feature_type).save() ln.Feature(name="precision", dtype=str, type=lightning_feature_type).save() ln.Feature( name="accumulate_grad_batches", dtype=int, type=lightning_feature_type ).save() ln.Feature( name="gradient_clip_val", dtype=float, type=lightning_feature_type ).save() ln.Feature(name="monitor", dtype=str, type=lightning_feature_type).save() ln.Feature(name="save_weights_only", dtype=bool, type=lightning_feature_type).save() ln.Feature(name="mode", dtype=str, type=lightning_feature_type).save() class FeatureAnnotator: """Manages Lightning feature discovery, collection, and annotation. This helper encapsulates all feature-related state and logic used by :class:`Checkpoint`. It handles: - Validation of user-specified features at setup time - Discovery of auto-features created by :func:`save_lightning_features` - Collection of run-level and checkpoint-level feature values - Best-model flag management and model rank updates The annotator is decoupled from `ModelCheckpoint` state — checkpoint-specific values (`best_model_path`, `current_score`, `mode`, etc.) are passed as explicit arguments to collection methods. """ def __init__( self, features: dict[Literal["run", "artifact"], dict[str, Any]] | None = None, ) -> None: user_features = features or {} if invalid_keys := set(user_features) - {"run", "artifact"}: # type: ignore raise ValueError( f"Invalid feature keys: {invalid_keys}. Use 'run' and/or 'artifact'." ) self._run_features: dict[str, Any] = user_features.get("run", {}) self._artifact_features: dict[str, Any] = user_features.get("artifact", {}) self._auto_features: dict[str, ln.Feature] = {} self._hparam_features_available: set[str] = set() self._run_features_saved = False def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None: """Validate user features and discover auto-features. Must be called during `Checkpoint.setup()` while `trainer.is_global_zero` is `True`. 
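# --- hedged example (not part of the source): after calling
# save_lightning_features(), the auto-features can be looked up by name, mirroring
# the filter calls used elsewhere in this module.
import lamindb as ln
from lamindb.integrations import lightning as ll

ll.save_lightning_features()
feature_type = ln.Feature.filter(name="lamindb.lightning", is_type=True).one_or_none()
score_feature = ln.Feature.filter(name="score").one_or_none()
assert feature_type is not None and score_feature is not None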
""" self._validate_user_features() self._attach_user_run_features() self._discover_auto_features() self._discover_hparam_features(trainer, pl_module) def _attach_user_run_features(self) -> None: """Attach user-specified run features to the active LaminDB run.""" if ln.context.run and self._run_features: ln.context.run.features.add_values(self._run_features) def _validate_user_features(self) -> None: """Ensure all user-specified feature names exist in the database.""" all_feature_names = set(self._run_features) | set(self._artifact_features) if not all_feature_names: return existing = set( ln.Feature.filter(name__in=all_feature_names).values_list("name", flat=True) ) missing = [n for n in all_feature_names if n not in existing] if missing: s = "s" if len(missing) > 1 else "" raise ValueError( f"Feature{s} {', '.join(missing)} missing. " f"Create {'them' if len(missing) > 1 else 'it'} first." ) def _discover_auto_features(self) -> None: """Load auto-features scoped to the `lamindb.lightning` feature type.""" lightning_feature_type = ln.Feature.filter( name="lamindb.lightning", is_type=True ).one_or_none() self._auto_features.clear() if lightning_feature_type is not None: self._auto_features = { f.name: f for f in ln.Feature.filter( name__in=_SUPPORTED_AUTO_FEATURES, type=lightning_feature_type, ) } def _discover_hparam_features( self, trainer: pl.Trainer, pl_module: pl.LightningModule ) -> None: """Find which hyperparameter names have matching Features in the DB.""" hparam_names = self._collect_hparam_names(pl_module, trainer.datamodule) self._hparam_features_available = ( set(ln.Feature.filter(name__in=hparam_names).values_list("name", flat=True)) if hparam_names else set() ) @staticmethod def _collect_hparam_names(*sources: Any) -> set[str]: """Gather hyperparameter names from one or more sources.""" names: set[str] = set() for source in sources: if source is not None and hasattr(source, "hparams") and source.hparams: names.update(source.hparams.keys()) return names def get(self, name: str) -> ln.Feature | None: """Return the typed auto-feature for *name*, or `None`.""" return self._auto_features.get(name) def _set(self, target: dict[str | ln.Feature, Any], name: str, value: Any) -> None: """Add *value* to *target* if the auto-feature *name* is tracked and *value* is not `None`.""" if (feature := self.get(name)) and value is not None: target[feature] = value def save_run_features( self, trainer: pl.Trainer, monitor: str | None, mode: str, ) -> None: """Collect and attach run-level features once per run. Idempotent — subsequent calls are no-ops. 
""" if not ln.context.run or self._run_features_saved: return run_features = self._collect_run_features(trainer, monitor, mode) if run_features: ln.context.run.features.add_values(run_features) self._run_features_saved = True def _collect_run_features( self, trainer: pl.Trainer, monitor: str | None, mode: str, ) -> dict[str | ln.Feature, Any]: """Build the dict of run-level feature values (pure, no DB writes).""" run_features: dict[str | ln.Feature, Any] = {} if trainer.loggers: self._set(run_features, "logger_name", trainer.loggers[0].name) version = trainer.loggers[0].version self._set( run_features, "logger_version", version if isinstance(version, str) else f"version_{version}", ) # Trainer config values self._add_trainer_config_features(run_features, trainer, monitor, mode) # Hyperparameters self._add_hparam_features( run_features, trainer.lightning_module, trainer.datamodule ) return run_features def _add_trainer_config_features( self, target: dict[str | ln.Feature, Any], trainer: pl.Trainer, monitor: str | None, mode: str, ) -> None: """Append trainer configuration values to *target*.""" self._set(target, "max_epochs", trainer.max_epochs) self._set(target, "max_steps", trainer.max_steps) self._set(target, "precision", str(trainer.precision)) self._set(target, "accumulate_grad_batches", trainer.accumulate_grad_batches) self._set(target, "gradient_clip_val", trainer.gradient_clip_val) self._set(target, "monitor", monitor) self._set(target, "mode", mode) def _add_hparam_features( self, target: dict[str | ln.Feature, Any], *sources: Any, ) -> None: """Append hyperparameter values from one or more sources to *target*.""" for source in sources: if source is None: continue if hasattr(source, "hparams") and source.hparams: for name, value in source.hparams.items(): if name in self._hparam_features_available: target[name] = value def collect_checkpoint_features( self, trainer: pl.Trainer, is_best: bool, current_score: Any | None, save_weights_only: bool, monitor: str | None, mode: str, ) -> dict[str | ln.Feature, Any]: """Collect feature values for a checkpoint artifact. All `ModelCheckpoint` state is passed as explicit arguments so the annotator stays decoupled from the callback class hierarchy. Does **not** mutate existing artifacts — call :meth:`clear_best_model_flags` or :meth:`clear_last_model_flags` separately when needed. 
""" feature_values: dict[str | ln.Feature, Any] = {} self._set(feature_values, "is_best_model", is_best) self._set(feature_values, "is_last_model", True) if current_score is not None: score = current_score if hasattr(score, "item"): score = score.item() self._set(feature_values, "score", float(score)) self._set(feature_values, "save_weights_only", save_weights_only) self._set(feature_values, "monitor", monitor) self._set(feature_values, "mode", mode) # User-specified artifact features for name, value in self._artifact_features.items(): if value is not None: feature_values[name] = value elif hasattr(trainer, name): feature_values[name] = getattr(trainer, name) elif name in trainer.callback_metrics: metric = trainer.callback_metrics[name] feature_values[name] = ( metric.item() if hasattr(metric, "item") else float(metric) ) return feature_values def clear_best_model_flags(self, checkpoint_key_prefix: str) -> None: """Set `is_best_model=False` on previous best checkpoints.""" self._clear_flagged_model_feature("is_best_model", checkpoint_key_prefix) def clear_last_model_flags(self, checkpoint_key_prefix: str) -> None: """Set `is_last_model=False` on previous latest checkpoints.""" self._clear_flagged_model_feature("is_last_model", checkpoint_key_prefix) def _clear_flagged_model_feature( self, feature_name: Literal["is_best_model", "is_last_model"], checkpoint_key_prefix: str, ) -> None: """Set a boolean model flag to `False` on previously flagged checkpoints.""" feature = self.get(feature_name) if feature is None: return feature_rows = self._get_artifact_feature_rows( {feature_name}, checkpoint_key_prefix ) artifact_ids = [ artifact_id for artifact_id, values in feature_rows.items() if values.get(feature_name) is True ] if not artifact_ids: return artifacts_by_id = {a.id: a for a in ln.Artifact.filter(id__in=artifact_ids)} for artifact_id in artifact_ids: if artifact_id not in artifacts_by_id: continue artifact = artifacts_by_id[artifact_id] artifact.features.remove_values(feature, value=True) artifact.features.add_values({feature: False}) def update_model_ranks(self, checkpoint_key_prefix: str, mode: str) -> None: """Re-rank all checkpoint artifacts under *checkpoint_key_prefix*.""" model_rank_feature = self.get("model_rank") if model_rank_feature is None: return feature_rows = self._get_artifact_feature_rows( {"score", "model_rank"}, checkpoint_key_prefix ) scored = [] for artifact_id, values in feature_rows.items(): if "score" in values: scored.append((values["score"], values.get("model_rank"), artifact_id)) scored.sort(key=lambda x: x[0], reverse=(mode == "max")) artifact_ids = [artifact_id for _, _, artifact_id in scored] artifacts_by_id = {a.id: a for a in ln.Artifact.filter(id__in=artifact_ids)} for rank, (_, old_rank, artifact_id) in enumerate(scored): if artifact_id not in artifacts_by_id: continue af = artifacts_by_id[artifact_id] if old_rank is not None: af.features.remove_values(model_rank_feature, value=old_rank) af.features.add_values({model_rank_feature: rank}) def _get_artifact_feature_rows( self, feature_names: set[str], checkpoint_key_prefix: str, ) -> dict[int, dict[str, Any]]: """Query feature values for checkpoint artifacts under *checkpoint_key_prefix*. Returns a dict keyed by artifact ID, where each value is a dict mapping feature name to its stored value. 
Example:: { 42: {"score": 0.95, "is_best_model": True}, 71: {"score": 0.87, "is_best_model": False, "model_rank": 1}, } """ feature_ids = [ feature.id for name in feature_names if (feature := self.get(name)) ] key_startswith = checkpoint_key_prefix + "/" if feature_ids: rows = ln.models.ArtifactJsonValue.filter( artifact__key__startswith=key_startswith, jsonvalue__feature_id__in=feature_ids, ).values_list("artifact_id", "jsonvalue__feature__name", "jsonvalue__value") else: rows = ln.models.ArtifactJsonValue.filter( artifact__key__startswith=key_startswith, jsonvalue__feature__name__in=feature_names, ).values_list("artifact_id", "jsonvalue__feature__name", "jsonvalue__value") result: dict[int, dict[str, Any]] = {} for artifact_id, feature_name, value in rows: if artifact_id not in result: result[artifact_id] = {} result[artifact_id][feature_name] = value return result class ArtifactPublishingModelCheckpoint(ModelCheckpoint): """ModelCheckpoint with observable artifact lifecycle hooks. This layer captures artifact kinds, observer registration, saved/removed events, latest artifact tracking, and key compatibility hooks. Concrete subclasses remain responsible for how artifacts are persisted. Subclasses are expected to implement: - :meth:`resolve_artifact_key` to map local files to logical artifact keys - :meth:`resolve_artifact_storage_uri` to expose a stable backend URI - :meth:`save_checkpoint_artifact`, :meth:`save_config_artifact`, and :meth:`save_hparams_artifact` to persist files :class:`SaveConfigCallback` only depends on this base class, which means a custom checkpoint callback can participate in config saving without inheriting from Lamin's concrete :class:`Checkpoint`. """ def __init__( self, *args: Any, artifact_observers: list[ArtifactObserver] | None = None, **kwargs: Any, ) -> None: super().__init__(*args, **kwargs) self._artifact_observers: list[ArtifactObserver] = list( artifact_observers or [] ) self._latest_artifacts: dict[ArtifactKind, Any | None] = { "checkpoint": None, "config": None, "hparams": None, } self._last_artifact_event: ArtifactSavedEvent | ArtifactRemovedEvent | None = ( None ) @property def last_checkpoint_artifact(self) -> Any | None: """The most recently saved checkpoint artifact handle.""" return self._latest_artifacts["checkpoint"] @property def last_config_artifact(self) -> Any | None: """The most recently saved config artifact handle.""" return self._latest_artifacts["config"] @property def last_hparams_artifact(self) -> Any | None: """The most recently saved hparams artifact handle.""" return self._latest_artifacts["hparams"] @property def last_artifact_event(self) -> ArtifactSavedEvent | ArtifactRemovedEvent | None: """The last artifact lifecycle event emitted by this callback.""" return self._last_artifact_event def get_last_artifact(self, kind: ArtifactKind) -> Any | None: """Return the most recently saved artifact for a given artifact kind.""" return self._latest_artifacts[kind] def add_artifact_observer(self, observer: ArtifactObserver) -> None: """Register an observer notified about artifact lifecycle events.""" self._artifact_observers.append(observer) def remove_artifact_observer(self, observer: ArtifactObserver) -> None: """Unregister a previously added artifact observer.""" self._artifact_observers.remove(observer) def resolve_artifact_storage_uri(self, artifact: Any) -> str: """Resolve the physical location for a persisted artifact.""" raise NotImplementedError def resolve_artifact_key( self, trainer: pl.Trainer, filepath: Path | str, kind: 
ArtifactKind, ) -> str: """Return the logical artifact key for a checkpoint-related file.""" raise NotImplementedError def _notify_artifact_saved( self, trainer: pl.Trainer, *, kind: ArtifactKind, key: str, artifact: Any, local_path: Path | str, ) -> ArtifactSavedEvent: event = ArtifactSavedEvent( kind=kind, key=key, local_path=Path(local_path), trainer=trainer, artifact=artifact, storage_uri=self.resolve_artifact_storage_uri(artifact), ) self._latest_artifacts[kind] = artifact self._last_artifact_event = event self.on_artifact_saved(event) self._notify_artifact_observers("on_artifact_saved", event) return event def _notify_artifact_removed( self, trainer: pl.Trainer, *, kind: ArtifactKind, key: str, local_path: Path | str, artifact: Any | None, ) -> ArtifactRemovedEvent: storage_uri = None if artifact is not None: storage_uri = self.resolve_artifact_storage_uri(artifact) event = ArtifactRemovedEvent( kind=kind, key=key, local_path=Path(local_path), trainer=trainer, artifact=artifact, storage_uri=storage_uri, ) self._last_artifact_event = event self.on_artifact_removed(event) self._notify_artifact_observers("on_artifact_removed", event) return event def _notify_artifact_observers( self, method_name: str, event: ArtifactSavedEvent | ArtifactRemovedEvent, ) -> None: for observer in tuple(self._artifact_observers): method = getattr(observer, method_name, None) if callable(method): method(event) def on_artifact_saved(self, event: ArtifactSavedEvent) -> None: """Hook for subclasses after an artifact has been saved.""" del event def on_artifact_removed(self, event: ArtifactRemovedEvent) -> None: """Hook for subclasses after a checkpoint file has been removed.""" del event def save_checkpoint_artifact( self, trainer: pl.Trainer, filepath: Path | str, *, feature_values: dict[str, Any] | None = None, ) -> Any: """Persist a checkpoint artifact and emit the corresponding event.""" del trainer, filepath, feature_values raise NotImplementedError def save_config_artifact(self, trainer: pl.Trainer, config_path: Path | str) -> Any: """Persist a config artifact and emit the corresponding event.""" del trainer, config_path raise NotImplementedError def save_hparams_artifact( self, trainer: pl.Trainer, hparams_path: Path | str ) -> Any | None: """Persist an hparams artifact and emit the corresponding event.""" del trainer, hparams_path raise NotImplementedError class Checkpoint(ArtifactPublishingModelCheckpoint): """A `ModelCheckpoint` that annotates `pytorch` `lightning` checkpoints. Extends `lightning`'s `ModelCheckpoint` with artifact creation & feature annotation. Each checkpoint is a separate artifact whose key is derived from either the explicit `dirpath` or the trainer's logger configuration. When `dirpath` is omitted (recommended), Lightning decides where to store checkpoints locally (typically `lightning_logs/version_N/checkpoints/`) and the artifact key is derived from the logger's `save_dir`, `name`, and `version`. When `dirpath` is provided, it is used directly as the key prefix. All artifacts are scoped under a single **base prefix**. Checkpoints (and `hparams.yaml`) live under `{base}/checkpoints/`; other artifacts (e.g. `config.yaml`) live directly under `{base}/`. Base prefix derivation (highest priority first): 1. `dirpath` provided → `{dirpath}` (logger is ignored for key purposes) 2. `dirpath` omitted, logger present → `{save_dir_basename}/{name}/{version}` 3. 
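# --- hedged example (not part of the source): subclassing the concrete Checkpoint
# and reacting in on_artifact_saved, as the class docstring suggests for secondary
# systems. The print call is a placeholder for e.g. a ClearML or MLflow client.
from lamindb.integrations.lightning import ArtifactSavedEvent, Checkpoint

class MirroringCheckpoint(Checkpoint):
    def on_artifact_saved(self, event: ArtifactSavedEvent) -> None:
        if event.kind == "checkpoint":
            # forward the stable storage URI to another registry (placeholder)
            print(f"mirror {event.key} from {event.storage_uri}")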
`dirpath` omitted, no logger → empty When `run_uid_is_version` is `True` (the default) and a Lamin run context is active, the run UID is incorporated into the base prefix: - Case 1/3: the run UID is appended as an extra path segment (e.g. `my/dir/{run_uid}`, or just `{run_uid}`). - Case 2: the logger's auto-incremented `version` is *replaced* by the run UID (`{save_dir_basename}/{name}/{run_uid}`). Resulting key layout (with run UID active):: {base}/checkpoints/epoch=0-step=100.ckpt {base}/checkpoints/hparams.yaml {base}/config.yaml If available in the database through `save_lightning_features()`, the following `lamindb.lightning` features are automatically tracked: - Artifact-level: `is_best_model`, `is_last_model`, `score`, `model_rank`, `save_weights_only`, `monitor`, `mode` - Run-level: `logger_name`, `logger_version`, `max_epochs`, `max_steps`, `precision`, `accumulate_grad_batches`, `gradient_clip_val`, `monitor`, `mode` Additionally, model hyperparameters (from `pl_module.hparams`) and datamodule hyperparameters (from `trainer.datamodule.hparams`) are captured if corresponding features exist. This is the concrete LaminDB implementation built on top of :class:`ArtifactPublishingModelCheckpoint`. Use it when you want LaminDB to be the persistence layer. For secondary systems such as ClearML, prefer attaching an :class:`ArtifactObserver` or subclassing :class:`Checkpoint` and reacting in :meth:`on_artifact_saved`. Args: dirpath: Directory for checkpoints. When provided, also used as the artifact key prefix. When omitted (recommended), Lightning picks the local directory and the key prefix is derived from the logger. features: Features to annotate runs and artifacts. Use "run" key for run-level features (static metadata). Use "artifact" key for artifact-level features (values can be static or None for auto-population from trainer metrics/attributes). monitor: Quantity to monitor for saving best checkpoint. verbose: Verbosity mode. save_last: Save a copy of the last checkpoint. save_top_k: Number of best checkpoints to keep. save_weights_only: Save only model weights (not optimizer state). mode: One of "min" or "max" for monitor comparison. auto_insert_metric_name: Include metric name in checkpoint filename. every_n_train_steps: Checkpoint every N training steps. train_time_interval: Checkpoint at time intervals. every_n_epochs: Checkpoint every N epochs. save_on_train_epoch_end: Run checkpointing at end of training epoch. enable_version_counter: Append version to filename to avoid collisions. run_uid_is_version: When `True` (default) and a Lamin run context is active, incorporate the run UID into the base prefix. For the logger case the logger's auto-incremented version is replaced; for the dirpath and no-logger cases the run UID is appended as an extra path segment. Prevents cross-run key collisions. artifact_observers: Optional observer objects notified when checkpoint, config, or hparams artifacts are saved or when checkpoint files are removed locally. Observers follow :class:`ArtifactObserver` and receive :class:`ArtifactSavedEvent` and :class:`ArtifactRemovedEvent`. 
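# --- hedged example (not part of the source): expected base prefixes and keys for
# the three derivation cases described above, with an illustrative run UID and
# run-UID scoping active.
run_uid = "9k2PqWv3x1Ab0000"  # placeholder UID
base_case_1 = f"deployments/my_model/{run_uid}"  # dirpath="deployments/my_model"
base_case_2 = f"logs/exp/{run_uid}"              # CSVLogger(save_dir="logs", name="exp"), no dirpath
base_case_3 = run_uid                            # no dirpath, no logger
checkpoint_key = f"{base_case_1}/checkpoints/epoch=0-step=100.ckpt"
hparams_key = f"{base_case_1}/checkpoints/hparams.yaml"
config_key = f"{base_case_1}/config.yaml"        # configs land directly under the base prefix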
Examples: Let Lightning decide where to store checkpoints (recommended):: import lightning as pl from lightning.pytorch.loggers import CSVLogger from lamindb.integrations import lightning as ll ll.save_lightning_features() callback = ll.Checkpoint(monitor="val_loss", save_top_k=3) logger = CSVLogger(save_dir="logs") trainer = pl.Trainer(callbacks=[callback], logger=logger) trainer.fit(model, dataloader) # Query checkpoints — key prefix is derived from the logger # e.g. "logs/lightning_logs/version_0/checkpoints/" ln.Artifact.filter(key__startswith=callback.checkpoint_key_prefix) Explicit `dirpath` for full control over the artifact key prefix:: callback = ll.Checkpoint( dirpath="deployments/my_model/", monitor="val_loss", save_top_k=3, ) trainer = pl.Trainer(callbacks=[callback]) trainer.fit(model, dataloader) # Query checkpoints ln.Artifact.filter(key__startswith=callback.checkpoint_key_prefix) Using the CLI:: # config.yaml trainer: callbacks: - class_path: lamindb.integrations.lightning.Checkpoint init_args: monitor: val_loss save_top_k: 3 # Run with: # python main.py fit --config config.yaml For more, see the guide: :doc:`lightning`. """ def __init__( self, dirpath: _PATH | None = None, *, features: dict[Literal["run", "artifact"], dict[str, Any]] | None = None, monitor: str | None = None, verbose: bool = False, save_last: bool | None = None, save_top_k: int = 1, save_weights_only: bool = False, mode: Literal["min", "max"] = "min", auto_insert_metric_name: bool = True, every_n_train_steps: int | None = None, train_time_interval: timedelta | None = None, every_n_epochs: int | None = None, save_on_train_epoch_end: bool | None = None, enable_version_counter: bool = True, run_uid_is_version: bool = True, artifact_observers: list[ArtifactObserver] | None = None, ) -> None: self._original_dirpath = dirpath super().__init__( dirpath=dirpath, monitor=monitor, verbose=verbose, save_last=save_last, save_top_k=save_top_k, save_weights_only=save_weights_only, mode=mode, auto_insert_metric_name=auto_insert_metric_name, every_n_train_steps=every_n_train_steps, train_time_interval=train_time_interval, every_n_epochs=every_n_epochs, save_on_train_epoch_end=save_on_train_epoch_end, enable_version_counter=enable_version_counter, artifact_observers=artifact_observers, ) self._feature_annotator = FeatureAnnotator(features) self._hparams_yaml_saved = False self._run_uid_is_version = run_uid_is_version self._trainer: pl.Trainer | None = None self._artifact_publisher: ArtifactPublisher = LaminArtifactPublisher() def setup( self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str ) -> None: """Validate user features and detect available auto-features.""" super().setup(trainer, pl_module, stage) self._trainer = trainer if self.save_last: warnings.warn( "save_last is not necessary with Lamin. Checkpoint metadata" " (is_best_model, is_last_model, model_rank, score) makes the latest checkpoint" " queryable without encoding this in the filename. Consider" " disabling save_last to avoid redundant checkpoint copies.", UserWarning, stacklevel=2, ) if trainer.is_global_zero: self._feature_annotator.setup(trainer, pl_module) def _base_prefix(self, trainer: pl.Trainer) -> str: """Compute the base artifact key prefix. The base prefix is the root namespace for all artifacts produced by this callback. Checkpoints live under `{base}/checkpoints/` and other files (config, hparams) directly under `{base}/`. Priority: explicit `dirpath` > logger > run UID > empty. 
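# --- hedged example (not part of the source): retrieving the persisted checkpoint
# after training, assuming `callback` is the Checkpoint instance from the examples
# above and trainer.fit(...) has completed. Artifact.cache() materializes a local
# copy; the query uses the key prefix exposed by the callback.
import lamindb as ln

ckpt_artifact = callback.last_checkpoint_artifact  # property from the base class
local_ckpt_path = ckpt_artifact.cache()            # local path to the .ckpt file
latest = (
    ln.Artifact.filter(key__startswith=callback.checkpoint_key_prefix)
    .order_by("-created_at")
    .first()
)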
""" run_uid = self._active_run_uid() if self._original_dirpath is not None: prefix = str(self._original_dirpath).rstrip("/") return f"{prefix}/{run_uid}" if run_uid else prefix if len(trainer.loggers) > 0: return self._logger_prefix(trainer, run_uid) return run_uid or "" def _active_run_uid(self) -> str | None: """Return the Lamin run UID when run-UID scoping is active.""" if self._run_uid_is_version and ln.context.run is not None: return ln.context.run.uid return None def _logger_prefix(self, trainer: pl.Trainer, run_uid: str | None) -> str: """Derive a key prefix from the trainer's first logger.""" assert trainer.loggers, "_logger_prefix requires at least one logger" logger = trainer.loggers[0] save_dir = logger.save_dir or trainer.default_root_dir name = str(logger.name).rstrip("/") if run_uid: version = run_uid else: version = logger.version version = version if isinstance(version, str) else f"version_{version}" return f"{Path(save_dir).name}/{name}/{version.rstrip('/')}" @property def base_prefix(self) -> str: """The base artifact key prefix for all artifacts from this callback. Checkpoints live under `{base_prefix}/checkpoints/` and configs directly under `{base_prefix}/`. Available after `setup()` has been called. """ assert self._trainer is not None, "base_prefix is only available after setup()" return self._base_prefix(self._trainer) @property def checkpoint_key_prefix(self) -> str: """The artifact key prefix used for checkpoint artifacts. Available after `setup()` has been called, for example once `trainer.fit()` has started. """ base = self.base_prefix return f"{base}/checkpoints" if base else "checkpoints" def resolve_artifact_storage_uri(self, artifact: ln.Artifact) -> str: """Resolve the physical artifact location for downstream registries. This is the stable abstraction external packages should use instead of reconstructing storage locations from Lamin internals. """ return self._artifact_publisher.storage_uri(artifact) def resolve_artifact_key( self, trainer: pl.Trainer, filepath: Path | str, kind: ArtifactKind, ) -> str: """Return the Lamin artifact key for a checkpoint-related file.""" base = self._base_prefix(trainer) if kind in {"checkpoint", "hparams"}: prefix = f"{base}/checkpoints" if base else "checkpoints" else: prefix = base if prefix: return f"{prefix}/{Path(filepath).name}" return Path(filepath).name def _create_lamin_artifact( self, local_path: Path | str, *, key: str, description: str, kind: str | None = None, add_as_input_to_run: bool = False, skip_hash_lookup: bool = False, ) -> ln.Artifact: return self._artifact_publisher.create_artifact( local_path, key=key, description=description, kind=kind, add_as_input_to_run=add_as_input_to_run, skip_hash_lookup=skip_hash_lookup, ) self._feature_annotator.clear_last_model_flags(self.checkpoint_key_prefix) def save_checkpoint_artifact( self, trainer: pl.Trainer, filepath: Path | str, *, feature_values: dict[str | ln.Feature, Any] | None = None, ) -> ln.Artifact: """Save a checkpoint artifact to Lamin and emit the corresponding event. This is the main persistence hook used by :meth:`_save_checkpoint`. It is a useful override point for subclasses that want to augment Lamin persistence while keeping the generic lifecycle behavior from the base class. 
""" key = self.resolve_artifact_key( trainer=trainer, filepath=filepath, kind="checkpoint" ) existing_artifact = ln.Artifact.filter(key=key).one_or_none() if existing_artifact is not None: existing_artifact.delete(permanent=True, storage=True) artifact = self._create_lamin_artifact( filepath, key=key, description="model checkpoint", kind="model", skip_hash_lookup=True, ) if feature_values: artifact.features.add_values(feature_values) self._notify_artifact_saved( trainer, kind="checkpoint", key=key, artifact=artifact, local_path=filepath, ) return artifact def save_config_artifact( self, trainer: pl.Trainer, config_path: Path | str ) -> ln.Artifact: """Save a Lightning CLI config artifact and emit the corresponding event. Config artifacts are routed through the same lifecycle surface as checkpoints so observers and subclasses see a unified event stream. """ key = self.resolve_artifact_key( trainer=trainer, filepath=config_path, kind="config" ) artifact = self._create_lamin_artifact( config_path, key=key, description="Lightning CLI config", kind="config", add_as_input_to_run=True, skip_hash_lookup=True, ) self._notify_artifact_saved( trainer, kind="config", key=key, artifact=artifact, local_path=config_path, ) return artifact def save_hparams_artifact( self, trainer: pl.Trainer, hparams_path: Path | str ) -> ln.Artifact | None: """Save Lightning's auto-generated hparams file and emit the event. Returns `None` if Lightning did not generate `hparams.yaml` for the current run. """ if not Path(hparams_path).exists(): return None key = self.resolve_artifact_key( trainer=trainer, filepath=hparams_path, kind="hparams" ) artifact = self._create_lamin_artifact( hparams_path, key=key, description="Lightning run hyperparameters", kind="config", skip_hash_lookup=True, ) self._notify_artifact_saved( trainer, kind="hparams", key=key, artifact=artifact, local_path=hparams_path, ) return artifact def _save_hparams_yaml(self, trainer: pl.Trainer) -> None: """Persist Lightning's auto-generated hparams file once per run.""" if self._hparams_yaml_saved: return log_dir = trainer.log_dir if not log_dir: return hparams_path = Path(log_dir) / "hparams.yaml" if not hparams_path.exists(): return if self.save_hparams_artifact(trainer, hparams_path) is not None: self._hparams_yaml_saved = True def _save_checkpoint(self, trainer: pl.Trainer, filepath: str) -> None: """Save checkpoint to the instance.""" super()._save_checkpoint(trainer, filepath) if not trainer.is_global_zero: return self._save_hparams_yaml(trainer) self._feature_annotator.save_run_features( trainer, monitor=self.monitor, mode=self.mode ) self._feature_annotator.clear_last_model_flags(self.checkpoint_key_prefix) is_best = self.best_model_path == str(filepath) feature_values = self._feature_annotator.collect_checkpoint_features( trainer, is_best=is_best, current_score=self.current_score, save_weights_only=self.save_weights_only, monitor=self.monitor, mode=self.mode, ) if is_best: self._feature_annotator.clear_best_model_flags(self.checkpoint_key_prefix) self.save_checkpoint_artifact(trainer, filepath, feature_values=feature_values) self._feature_annotator.update_model_ranks( self.checkpoint_key_prefix, mode=self.mode ) def _remove_checkpoint(self, trainer: pl.Trainer, filepath: str) -> None: """Remove the local checkpoint file and emit a removal event.""" artifact: ln.Artifact | None = None key = self.resolve_artifact_key( trainer=trainer, filepath=filepath, kind="checkpoint" ) if trainer.is_global_zero: artifact = 
ln.Artifact.filter(key=key).one_or_none() super()._remove_checkpoint(trainer, filepath) if trainer.is_global_zero: self._notify_artifact_removed( trainer, kind="checkpoint", key=key, local_path=filepath, artifact=artifact, ) if artifact is not None: artifact.delete(permanent=True, storage=True) class SaveConfigCallback(_SaveConfigCallback): """SaveConfigCallback that also saves config to the instance. Use with LightningCLI to save the resolved configuration file alongside checkpoints. The local config file is saved under `{save_dir}/{name}/{version}/` derived from the first logger, avoiding Lightning's `trainer.log_dir` which hardcodes an `isinstance` check for `TensorBoardLogger` / `CSVLogger` and silently changes the directory for other loggers. This callback looks for any :class:`ArtifactPublishingModelCheckpoint`, not just Lamin's concrete :class:`Checkpoint`. That keeps the config-save path aligned with custom subclasses built on the generic artifact-publishing base. Config artifacts are stored directly under the **base prefix** of the active :class:`Checkpoint` callback. The base prefix follows the same derivation rules as for checkpoints (dirpath > logger > empty), so configs are always co-located with their checkpoints: - `Checkpoint.dirpath` set → `{dirpath}/config.yaml` (`{dirpath}/{run_uid}/config.yaml` with run-UID scoping) - Logger present, no `dirpath` → `{save_dir_basename}/{name}/{version}/config.yaml` - Neither → `config.yaml` (or `{run_uid}/config.yaml` with run-UID scoping) Example:: from lightning.pytorch.cli import LightningCLI from lamindb.integrations import lightning as ll cli = LightningCLI( MyModel, MyDataModule, save_config_callback=ll.SaveConfigCallback, ) """ def setup( self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str ) -> None: """Save resolved configuration file alongside checkpoints.""" if self.already_saved: # type: ignore return if self.save_to_log_dir: config_path = self._config_path(trainer) if not self.overwrite: file_exists = config_path.exists() if trainer.is_global_zero else False file_exists = trainer.strategy.broadcast(file_exists) if file_exists: raise RuntimeError(f"Config file already exists: {config_path}") if trainer.is_global_zero: config_path.parent.mkdir(exist_ok=True, parents=True) self.parser.save( self.config, config_path, skip_none=False, overwrite=self.overwrite, multifile=self.multifile, ) self._save_config(trainer, config_path) if trainer.is_global_zero: self.save_config(trainer, pl_module, stage) self.already_saved = True self.already_saved = trainer.strategy.broadcast(self.already_saved) def _config_path(self, trainer: pl.Trainer) -> Path: """Derive the local config file path from the first logger. We intentionally avoid `trainer.log_dir` because Lightning hardcodes an `isinstance` check against `TensorBoardLogger` and `CSVLogger` there. For those two loggers it uses `logger.log_dir` (which appends name/version), while for every other logger it falls back to `logger.save_dir` (no name/version). This means the config file location silently changes depending on which logger happens to be first — making it unpredictable for third-party loggers. This method always uses `logger.save_dir` + `name` + `version`, giving a consistent directory layout regardless of logger type. 
""" if len(trainer.loggers) > 0: first = trainer.loggers[0] save_dir = ( first.save_dir if first.save_dir is not None else trainer.default_root_dir ) name = first.name version = first.version version = version if isinstance(version, str) else f"version_{version}" return Path(save_dir) / str(name) / version / self.config_filename return Path(trainer.default_root_dir) / self.config_filename def _save_config(self, trainer: pl.Trainer, config_path: Path) -> None: """Persist the resolved config through the active artifact checkpoint. If no artifact-publishing checkpoint callback is registered, this becomes a no-op and only Lightning's local config file is written. """ checkpoint_cb = self._get_artifact_checkpoint_callback(trainer) if checkpoint_cb is None: return checkpoint_cb.save_config_artifact(trainer, config_path) def _get_artifact_checkpoint_callback( self, trainer: pl.Trainer ) -> ArtifactPublishingModelCheckpoint | None: """Find the artifact-publishing checkpoint callback if present.""" for cb in trainer.callbacks: if isinstance(cb, ArtifactPublishingModelCheckpoint): return cb return None # backwards compatibility # We keep the full class around because it's short and it's cumbersome to write # full backwards compatibility code because of the rather different interfaces and behavior class Callback(pl.Callback): """Saves checkpoints to LaminDB after each training epoch. .. deprecated:: Use :class:`Checkpoint` instead for new code. Args: path: A local path to the checkpoint. key: The `key` for the checkpoint artifact. features: Features to annotate the checkpoint. """ def __init__( self, path: str | Path, key: str, features: dict[str, Any] | None = None, ): warnings.warn( "ll.Callback is deprecated, use ll.Checkpoint instead", DeprecationWarning, stacklevel=2, ) self.path = Path(path) self.key = key self.features = features or {} def on_train_start( self, trainer: pl.Trainer, pl_module: pl.LightningModule ) -> None: """Validates that features exist for all specified params.""" missing = [ name for name in self.features if ln.Feature.filter(name=name).one_or_none() is None ] if missing: s = "s" if len(missing) > 1 else "" raise ValueError( f"Feature{s} {', '.join(missing)} missing. " f"Create {'them' if len(missing) > 1 else 'it'} first." 
) def on_train_epoch_end( self, trainer: pl.Trainer, pl_module: pl.LightningModule ) -> None: """Saves model checkpoint at the end of each epoch.""" trainer.save_checkpoint(self.path) artifact = ln.Artifact(self.path, key=self.key, kind="model").save() feature_values = dict(self.features) for name in self.features: if hasattr(trainer, name): feature_values[name] = getattr(trainer, name) elif name in trainer.callback_metrics: metric = trainer.callback_metrics[name] feature_values[name] = ( metric.item() if hasattr(metric, "item") else float(metric) ) if feature_values: artifact.features.add_values(feature_values) __all__ = [ "ArtifactObserver", "ArtifactEvent", "ArtifactPublisher", "ArtifactPublishingModelCheckpoint", "ArtifactRemovedEvent", "ArtifactSavedEvent", "Checkpoint", "LaminArtifactPublisher", "SaveConfigCallback", "save_lightning_features", ] ================================================ FILE: lamindb/migrations/0177_squashed.py ================================================ # Generated by Django 5.2 on 2026-01-10 23:06 import django.core.validators import django.db.models.deletion import django.db.models.functions.datetime import django.db.models.functions.text import pgtrigger.compiler import pgtrigger.migrations from django.db import connection, migrations, models import lamindb.base.fields import lamindb.base.uids import lamindb.base.users import lamindb.models.can_curate import lamindb.models.has_parents import lamindb.models.run import lamindb.models.sqlrecord CREATE_IS_VALID_RECORD_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_record_type(record_type_id INTEGER, record_is_type BOOLEAN) RETURNS BOOLEAN AS $$ BEGIN -- Record with no type is valid IF record_type_id IS NULL THEN RETURN TRUE; END IF; -- If current record is a type, it can only reference schema-less types IF record_is_type THEN RETURN EXISTS ( SELECT 1 FROM lamindb_record r WHERE r.id = record_type_id AND r.is_type AND r.schema_id IS NULL ); END IF; -- Regular records can reference any type RETURN EXISTS ( SELECT 1 FROM lamindb_record r WHERE r.id = record_type_id AND r.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_RECORD_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_record ADD CONSTRAINT record_type_is_valid_fk CHECK (is_valid_record_type(type_id, is_type)); """ CREATE_IS_VALID_FEATURE_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_feature_type(feature_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN -- Feature with no type is valid IF feature_type_id IS NULL THEN RETURN TRUE; END IF; -- Type must have is_type = TRUE RETURN EXISTS ( SELECT 1 FROM lamindb_feature f WHERE f.id = feature_type_id AND f.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_FEATURE_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_feature ADD CONSTRAINT feature_type_is_valid_fk CHECK (is_valid_feature_type(type_id)); """ CREATE_IS_VALID_SCHEMA_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_schema_type(schema_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF schema_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM lamindb_schema s WHERE s.id = schema_type_id AND s.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_SCHEMA_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_schema ADD CONSTRAINT schema_type_is_valid_fk CHECK (is_valid_schema_type(type_id)); """ CREATE_IS_VALID_PROJECT_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_project_type(project_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF project_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM 
lamindb_project p WHERE p.id = project_type_id AND p.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_PROJECT_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_project ADD CONSTRAINT project_type_is_valid_fk CHECK (is_valid_project_type(type_id)); """ CREATE_IS_VALID_REFERENCE_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_reference_type(reference_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF reference_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM lamindb_reference r WHERE r.id = reference_type_id AND r.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_REFERENCE_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_reference ADD CONSTRAINT reference_type_is_valid_fk CHECK (is_valid_reference_type(type_id)); """ CREATE_IS_VALID_ULABEL_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_ulabel_type(ulabel_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF ulabel_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM lamindb_ulabel u WHERE u.id = ulabel_type_id AND u.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_ULABEL_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_ulabel ADD CONSTRAINT ulabel_type_is_valid_fk CHECK (is_valid_ulabel_type(type_id)); """ def apply_constraints(apps, schema_editor): if schema_editor.connection.vendor == "postgresql": schema_editor.execute(CREATE_IS_VALID_RECORD_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_RECORD_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_FEATURE_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_FEATURE_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_SCHEMA_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_SCHEMA_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_PROJECT_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_PROJECT_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_REFERENCE_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_REFERENCE_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_ULABEL_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_ULABEL_TYPE_CONSTRAINT) class Migration(migrations.Migration): initial = True dependencies = [] # type: ignore operations = [ migrations.CreateModel( name="Migration", fields=[ ( "id", models.BigAutoField( auto_created=True, primary_key=True, serialize=False, verbose_name="ID", ), ), ( "app", lamindb.base.fields.CharField( blank=True, default=None, max_length=255 ), ), ( "name", lamindb.base.fields.CharField( blank=True, default=None, max_length=255 ), ), ("applied", lamindb.base.fields.DateTimeField(blank=True)), ], options={ "db_table": "django_migrations", "managed": False, }, ), migrations.CreateModel( name="Block", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", 
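# --- hedged example (not part of the source): what the is_valid_*_type CHECK
# constraints above enforce, expressed with lamindb objects. On PostgreSQL,
# pointing `type` at a record that is not itself a type violates the constraint;
# the exact exception surfaced to Python may differ. Names are illustrative.
import lamindb as ln

celltype = ln.ULabel(name="CellType", is_type=True).save()  # a type -> valid target for type_id
t_cell = ln.ULabel(name="T cell", type=celltype).save()     # OK: references a row with is_type=True
# ln.ULabel(name="B cell", type=t_cell).save()              # would violate ulabel_type_is_valid_fk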
models.JSONField(db_default=None, default=None, null=True)), ("key", models.CharField(db_index=True, max_length=1024)), ], ), migrations.CreateModel( name="Branch", fields=[ ("id", models.AutoField(primary_key=True, serialize=False)), ("name", models.CharField(db_index=True, max_length=100)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ], ), migrations.CreateModel( name="Space", fields=[ ("id", models.SmallAutoField(primary_key=True, serialize=False)), ("name", models.CharField(db_index=True, max_length=100)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ], ), migrations.CreateModel( name="Artifact", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=20, unique=True, ), ), ( "key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=1024, null=True, ), ), ( "_real_key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=1024, null=True, ), ), ( "description", lamindb.base.fields.TextField( blank=True, db_index=True, default=None, null=True ), ), ( "suffix", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=30, ), ), ( "kind", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=20, null=True, ), ), ( "otype", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=64, null=True, ), ), ( "size", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, editable=False, null=True, ), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=22, null=True, ), ), ( "n_files", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, editable=False, null=True, ), ), ( "n_observations", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, editable=False, null=True, ), ), ( "_hash_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, 
max_length=30, null=True, ), ), ( "_key_is_virtual", lamindb.base.fields.BooleanField(blank=True, default=None), ), ( "_overwrite_versions", lamindb.base.fields.BooleanField(blank=True, default=None), ), ( "_actions", models.ManyToManyField( related_name="_action_targets", to="lamindb.artifact" ), ), ], options={ "abstract": False, }, ), migrations.CreateModel( name="ArtifactArtifact", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_artifact", to="lamindb.artifact", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_value", to="lamindb.artifact", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifact", name="artifacts", field=models.ManyToManyField( related_name="linked_by_artifacts", through="lamindb.ArtifactArtifact", to="lamindb.artifact", ), ), migrations.CreateModel( name="BlockProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "block", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.block", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="block", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="artifact", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.CreateModel( name="Collection", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_20, editable=False, max_length=20, unique=True, ), ), ( "key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=22, null=True, ), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, 
max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "_actions", models.ManyToManyField(related_name="+", to="lamindb.artifact"), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "meta_artifact", lamindb.base.fields.OneToOneField( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="_meta_of_collection", to="lamindb.artifact", ), ), ], options={ "abstract": False, }, ), migrations.CreateModel( name="CollectionArtifact", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.artifact", ), ), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_artifact", to="lamindb.collection", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="collection", name="artifacts", field=models.ManyToManyField( related_name="collections", through="lamindb.CollectionArtifact", to="lamindb.artifact", ), ), migrations.CreateModel( name="CollectionProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.collection", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="CollectionReference", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.collection", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="Feature", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150 ), ), ( "_dtype_str", 
lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "unit", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ("array_rank", models.SmallIntegerField(db_index=True, default=0)), ("array_size", models.IntegerField(db_index=True, default=0)), ( "array_shape", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "synonyms", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "default_value", lamindb.base.fields.JSONField(blank=True, default=None, null=True), ), ( "nullable", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "coerce", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="features", to="lamindb.feature", ), ), ], options={ "abstract": False, }, bases=(lamindb.models.can_curate.CanCurate, models.Model), ), migrations.CreateModel( name="CollectionRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.collection", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collectionrecord", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactRun", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_run", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactrun", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactReference", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactreference", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, 
db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactrecord", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactproject", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifactartifact", name="feature", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactartifact", to="lamindb.feature", ), ), migrations.CreateModel( name="FeatureProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="JsonValue", fields=[ ( "id", models.BigAutoField( auto_created=True, primary_key=True, serialize=False, verbose_name="ID", ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("value", models.JSONField()), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=22, null=True, ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="values", to="lamindb.feature", ), ), ], options={ "abstract": False, "base_manager_name": "objects", }, ), migrations.CreateModel( name="ArtifactJsonValue", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_jsonvalue", to="lamindb.artifact", ), ), ( "jsonvalue", 
lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.jsonvalue", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifact", name="json_values", field=models.ManyToManyField( related_name="artifacts", through="lamindb.ArtifactJsonValue", to="lamindb.jsonvalue", ), ), migrations.CreateModel( name="Project", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "abbr", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32, null=True, ), ), ( "url", lamindb.base.fields.URLField( blank=True, default=None, max_length=255, null=True ), ), ( "start_date", lamindb.base.fields.DateField(blank=True, default=None, null=True), ), ( "end_date", lamindb.base.fields.DateField(blank=True, default=None, null=True), ), ("_status_code", models.SmallIntegerField(db_index=True, default=0)), ( "artifacts", models.ManyToManyField( related_name="projects", through="lamindb.ArtifactProject", to="lamindb.artifact", ), ), ( "blocks", models.ManyToManyField( related_name="projects", through="lamindb.BlockProject", to="lamindb.block", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "collections", models.ManyToManyField( related_name="projects", through="lamindb.CollectionProject", to="lamindb.collection", ), ), ( "features", models.ManyToManyField( related_name="projects", through="lamindb.FeatureProject", to="lamindb.feature", ), ), ( "parents", models.ManyToManyField( related_name="children", to="lamindb.project" ), ), ( "predecessors", models.ManyToManyField( related_name="successors", to="lamindb.project" ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="projects", to="lamindb.project", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.can_curate.CanCurate, models.Model, lamindb.models.sqlrecord.ValidateFields, ), ), migrations.AddField( model_name="featureproject", name="project", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_feature", to="lamindb.project", ), ), migrations.AddField( model_name="collectionproject", name="project", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.project", ), 
), migrations.AddField( model_name="blockproject", name="project", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_block", to="lamindb.project", ), ), migrations.AddField( model_name="artifactproject", name="project", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.project", ), ), migrations.CreateModel( name="Record", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=16, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ("extra_data", models.JSONField(null=True)), ( "artifacts", models.ManyToManyField( related_name="records", through="lamindb.ArtifactRecord", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "collections", models.ManyToManyField( related_name="records", through="lamindb.CollectionRecord", to="lamindb.collection", ), ), ( "parents", models.ManyToManyField( related_name="children", to="lamindb.record" ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="records", to="lamindb.record", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.has_parents.HasParents, lamindb.models.can_curate.CanCurate, models.Model, ), ), migrations.CreateModel( name="ProjectRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_projectrecord", to="lamindb.feature", ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.project", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.record", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="project", name="records", 
field=models.ManyToManyField( related_name="projects", through="lamindb.ProjectRecord", to="lamindb.record", ), ), migrations.AddField( model_name="collectionrecord", name="record", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.record", ), ), migrations.AddField( model_name="artifactrecord", name="record", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.record", ), ), migrations.CreateModel( name="RecordArtifact", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordartifact", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_artifact", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.artifact", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="artifact", name="linked_in_records", field=models.ManyToManyField( related_name="linked_artifacts", through="lamindb.RecordArtifact", to="lamindb.record", ), ), migrations.CreateModel( name="RecordCollection", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordcollection", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_collection", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.collection", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="collection", name="linked_in_records", field=models.ManyToManyField( related_name="linked_collections", through="lamindb.RecordCollection", to="lamindb.record", ), ), migrations.CreateModel( name="RecordProject", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordproject", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_project", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.project", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="project", name="linked_in_records", field=models.ManyToManyField( related_name="linked_projects", through="lamindb.RecordProject", to="lamindb.record", ), ), migrations.CreateModel( name="RecordRecord", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, 
on_delete=django.db.models.deletion.PROTECT, related_name="links_recordrecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_record", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.record", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="record", name="linked_records", field=models.ManyToManyField( related_name="linked_in_records", through="lamindb.RecordRecord", to="lamindb.record", ), ), migrations.CreateModel( name="RecordReference", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordreference", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_reference", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordRun", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordrun", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_run", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordTransform", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordtransform", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_transform", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordULabel", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordulabel", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_ulabel", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordUser", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recorduser", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_user", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="Reference", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( 
blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "abbr", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32, null=True, ), ), ( "url", lamindb.base.fields.URLField(blank=True, db_index=True, null=True), ), ( "pubmed_id", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, null=True ), ), ( "doi", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, validators=[ django.core.validators.RegexValidator( message="Must be a DOI (e.g., 10.1000/xyz123 or https://doi.org/10.1000/xyz123)", regex="^(?:https?://(?:dx\\.)?doi\\.org/|doi:|DOI:)?10\\.\\d+/.*$", ) ], ), ), ( "text", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "date", lamindb.base.fields.DateField(blank=True, default=None, null=True), ), ( "artifacts", models.ManyToManyField( related_name="references", through="lamindb.ArtifactReference", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "collections", models.ManyToManyField( related_name="references", through="lamindb.CollectionReference", to="lamindb.collection", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_references", through="lamindb.RecordReference", to="lamindb.record", ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="references", to="lamindb.reference", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.can_curate.CanCurate, models.Model, lamindb.models.sqlrecord.ValidateFields, ), ), migrations.AddField( model_name="recordreference", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.reference", ), ), migrations.AddField( model_name="project", name="references", field=models.ManyToManyField( related_name="projects", to="lamindb.reference" ), ), migrations.AddField( model_name="collectionreference", name="reference", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.reference", ), ), migrations.AddField( model_name="artifactreference", name="reference", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.reference", ), ), migrations.CreateModel( name="ReferenceRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", 
models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_referencerecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.record", ), ), ( "reference", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.reference", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="reference", name="records", field=models.ManyToManyField( related_name="references", through="lamindb.ReferenceRecord", to="lamindb.record", ), ), migrations.CreateModel( name="Run", fields=[ ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "entrypoint", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "started_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "finished_at", lamindb.base.fields.DateTimeField( blank=True, db_index=True, default=None, null=True ), ), ("params", models.JSONField(null=True)), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "cli_args", lamindb.base.fields.CharField( blank=True, default=None, max_length=1024, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_is_consecutive", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "_status_code", models.SmallIntegerField( db_default=-3, db_index=True, default=-3, null=True ), ), ( "artifacts", models.ManyToManyField( related_name="runs", through="lamindb.ArtifactRun", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "environment", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="_environment_of", to="lamindb.artifact", ), ), ( "initiated_by_run", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="initiated_runs", to="lamindb.run", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_runs", through="lamindb.RecordRun", to="lamindb.record", ), ), ( "report", lamindb.base.fields.ForeignKey( blank=True, 
default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="_report_of", to="lamindb.artifact", ), ), ], ), migrations.AddField( model_name="referencerecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="reference", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="recordrun", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.run", ), ), migrations.AddField( model_name="record", name="input_of_runs", field=models.ManyToManyField( related_name="input_records", to="lamindb.run" ), ), migrations.AddField( model_name="record", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, editable=False, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="output_records", to="lamindb.run", ), ), migrations.AddField( model_name="projectrecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="project", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="jsonvalue", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="featureproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="feature", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionreference", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionrecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionartifact", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collection", 
name="input_of_runs", field=models.ManyToManyField( related_name="input_collections", to="lamindb.run" ), ), migrations.AddField( model_name="collection", name="recreating_runs", field=models.ManyToManyField( related_name="recreated_collections", to="lamindb.run" ), ), migrations.AddField( model_name="collection", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="output_collections", to="lamindb.run", ), ), migrations.AddField( model_name="blockproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.CreateModel( name="ArtifactUser", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_user", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactuser", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactULabel", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactulabel", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifactrun", name="run", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_artifact", to="lamindb.run", ), ), migrations.AddField( model_name="artifactreference", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactrecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactjsonvalue", name="run", 
field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactartifact", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifact", name="input_of_runs", field=models.ManyToManyField( related_name="input_artifacts", to="lamindb.run" ), ), migrations.AddField( model_name="artifact", name="recreating_runs", field=models.ManyToManyField( related_name="recreated_artifacts", to="lamindb.run" ), ), migrations.AddField( model_name="artifact", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=None, editable=False, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="output_artifacts", to="lamindb.run", ), ), migrations.CreateModel( name="RunJsonValue", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "jsonvalue", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.jsonvalue", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_jsonvalue", to="lamindb.run", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="run", name="json_values", field=models.ManyToManyField( related_name="runs", through="lamindb.RunJsonValue", to="lamindb.jsonvalue", ), ), migrations.CreateModel( name="RunProject", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.run", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="project", name="runs", field=models.ManyToManyField( related_name="projects", through="lamindb.RunProject", to="lamindb.run" ), ), migrations.CreateModel( name="RunRecord", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_runrecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.record", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.run", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="record", name="runs", field=models.ManyToManyField( related_name="records", through="lamindb.RunRecord", 
to="lamindb.run" ), ), migrations.CreateModel( name="Schema", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=16, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "n_members", lamindb.base.fields.IntegerField( blank=True, default=None, null=True ), ), ( "coerce", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "flexible", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "itype", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=120, null=True, ), ), ( "otype", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=64, null=True, ), ), ( "_dtype_str", lamindb.base.fields.CharField( blank=True, default=None, editable=False, max_length=64, null=True, ), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=22, null=True, ), ), ( "minimal_set", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True, editable=False ), ), ( "ordered_set", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=False, editable=False ), ), ( "maximal_set", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=False, editable=False ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="schemas", to="lamindb.schema", ), ), ], options={ "abstract": False, }, bases=(lamindb.models.can_curate.CanCurate, models.Model), ), migrations.AddField( model_name="record", name="schema", field=lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="records", to="lamindb.schema", ), ), migrations.CreateModel( name="ArtifactSchema", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "slot", lamindb.base.fields.CharField( blank=True, default=None, max_length=255, null=True ), ), ( "feature_ref_is_semantic", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "artifact", 
lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="_links_schema", to="lamindb.artifact", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "schema", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="_links_artifact", to="lamindb.schema", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifact", name="schema", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="validated_artifacts", to="lamindb.schema", ), ), migrations.AddField( model_name="artifact", name="schemas", field=models.ManyToManyField( related_name="artifacts", through="lamindb.ArtifactSchema", to="lamindb.schema", ), ), migrations.CreateModel( name="SchemaComponent", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "slot", lamindb.base.fields.CharField( blank=True, default=None, max_length=255, null=True ), ), ( "component", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_composite", to="lamindb.schema", ), ), ( "composite", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_component", to="lamindb.schema", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="schema", name="components", field=models.ManyToManyField( related_name="composites", through="lamindb.SchemaComponent", to="lamindb.schema", ), ), migrations.CreateModel( name="SchemaFeature", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_schema", to="lamindb.feature", ), ), ( "schema", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_feature", to="lamindb.schema", ), ), ], options={ "unique_together": {("schema", "feature")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="feature", name="schemas", field=models.ManyToManyField( related_name="features", through="lamindb.SchemaFeature", to="lamindb.schema", ), ), migrations.CreateModel( name="SchemaProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_schema", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "schema", lamindb.base.fields.ForeignKey( blank=True, 
on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.schema", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="project", name="schemas", field=models.ManyToManyField( related_name="projects", through="lamindb.SchemaProject", to="lamindb.schema", ), ), migrations.AddField( model_name="schema", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="run", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="reference", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="record", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="project", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="jsonvalue", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="feature", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="collection", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="branch", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="block", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="artifact", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.CreateModel( name="Storage", fields=[ ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "root", 
lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, unique=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30 ), ), ( "region", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=64, null=True, ), ), ( "instance_uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=12, null=True, ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "space", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), ], options={ "abstract": False, }, ), migrations.AddField( model_name="artifact", name="storage", field=lamindb.base.fields.ForeignKey( blank=True, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="artifacts", to="lamindb.storage", ), ), migrations.CreateModel( name="Transform", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=16, unique=True, ), ), ( "key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=1024 ), ), ( "description", lamindb.base.fields.TextField( blank=True, db_index=True, default=None, null=True ), ), ( "kind", lamindb.base.fields.CharField( blank=True, db_index=True, default="pipeline", max_length=20 ), ), ( "source_code", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=22, null=True, ), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "environment", models.ForeignKey( null=True, on_delete=django.db.models.deletion.CASCADE, related_name="_environment_of_transforms", to="lamindb.artifact", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_transforms", through="lamindb.RecordTransform", 
to="lamindb.record", ), ), ( "space", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), ], options={ "abstract": False, }, ), migrations.AddField( model_name="run", name="transform", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="runs", to="lamindb.transform", ), ), migrations.AddField( model_name="recordtransform", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.transform", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="project", name="transforms", field=models.ManyToManyField( related_name="projects", through="lamindb.TransformProject", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformRecord", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), editable=False, ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transformrecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.record", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.transform", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="record", name="transforms", field=models.ManyToManyField( related_name="records", through="lamindb.TransformRecord", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformReference", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "reference", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.reference", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", 
lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.transform", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="reference", name="transforms", field=models.ManyToManyField( related_name="references", through="lamindb.TransformReference", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformTransform", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ("config", models.JSONField(default=None, null=True)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), editable=False, ), ), ( "predecessor", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_successor", to="lamindb.transform", ), ), ( "successor", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_predecessor", to="lamindb.transform", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="transform", name="predecessors", field=models.ManyToManyField( related_name="successors", through="lamindb.TransformTransform", to="lamindb.transform", ), ), migrations.CreateModel( name="ULabel", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_8, editable=False, max_length=8, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "artifacts", models.ManyToManyField( related_name="ulabels", through="lamindb.ArtifactULabel", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_ulabels", through="lamindb.RecordULabel", to="lamindb.record", ), ), ( "parents", models.ManyToManyField( related_name="children", to="lamindb.ulabel" ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "space", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), ( "type", 
lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="ulabels", to="lamindb.ulabel", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.has_parents.HasParents, lamindb.models.can_curate.CanCurate, models.Model, ), ), migrations.CreateModel( name="TransformULabel", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.transform", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.ulabel", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="transform", name="ulabels", field=models.ManyToManyField( related_name="transforms", through="lamindb.TransformULabel", to="lamindb.ulabel", ), ), migrations.CreateModel( name="RunULabel", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.run", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.ulabel", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="run", name="ulabels", field=models.ManyToManyField( related_name="runs", through="lamindb.RunULabel", to="lamindb.ulabel" ), ), migrations.AddField( model_name="recordulabel", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.ulabel", ), ), migrations.CreateModel( name="CollectionULabel", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.collection", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collectionulabel", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.ulabel", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="collection", name="ulabels", field=models.ManyToManyField( related_name="collections", through="lamindb.CollectionULabel", to="lamindb.ulabel", 
), ), migrations.AddField( model_name="artifactulabel", name="ulabel", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.ulabel", ), ), migrations.CreateModel( name="ULabelProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_ulabel", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.ulabel", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="project", name="ulabels", field=models.ManyToManyField( related_name="projects", through="lamindb.ULabelProject", to="lamindb.ulabel", ), ), migrations.CreateModel( name="User", fields=[ ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=8, unique=True, ), ), ( "handle", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "artifacts", models.ManyToManyField( related_name="users", through="lamindb.ArtifactUser", through_fields=("user", "artifact"), to="lamindb.artifact", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_users", through="lamindb.RecordUser", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.can_curate.CanCurate), ), migrations.AddField( model_name="ulabelproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="ULabelBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "ulabel", models.ForeignKey( 
on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.ulabel", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="ulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformtransform", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformreference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="TransformBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ("line_number", models.IntegerField(null=True)), ( "transform", models.ForeignKey( null=True, on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.transform", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="transform", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="created_transforms", to="lamindb.user", ), ), migrations.AddField( model_name="storage", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", 
to="lamindb.user", ), ), migrations.CreateModel( name="SpaceBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "space", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.space", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="space", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="schemaproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="schemacomponent", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="SchemaBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "schema", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.schema", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="schema", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="runulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", 
to="lamindb.user", ), ), migrations.AddField( model_name="runrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="runproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="runjsonvalue", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="RunBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "run", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.run", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="run", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.CASCADE, related_name="created_runs", to="lamindb.user", ), ), migrations.AddField( model_name="referencerecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="reference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="recorduser", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.user", ), ), migrations.CreateModel( name="RecordBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, 
default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "record", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.record", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="record", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="projectrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="ProjectBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "project", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.project", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="project", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="jsonvalue", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="featureproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="FeatureBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( 
"kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "feature", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.feature", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="feature", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionreference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="CollectionBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "collection", models.ForeignKey( null=True, on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.collection", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="collectionartifact", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collection", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, 
on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="BranchBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.branch", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="branch", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="blockproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="block", name="created_by", field=models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactuser", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactuser", name="user", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.user", ), ), migrations.AddField( model_name="artifactulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactschema", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactrun", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactreference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactrecord", name="created_by", 
field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactjsonvalue", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="ArtifactBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "artifact", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.artifact", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="artifactartifact", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifact", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="created_artifacts", to="lamindb.user", ), ), migrations.CreateModel( name="RecordJson", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "value", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordjson", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_json", to="lamindb.record", ), ), ], options={ "unique_together": {("record", "feature")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AlterUniqueTogether( name="recordreference", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="recordrun", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="recordtransform", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="recordulabel", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( 
name="ulabelproject", unique_together={("ulabel", "project")}, ), migrations.AlterUniqueTogether( name="transformulabel", unique_together={("transform", "ulabel")}, ), migrations.AlterUniqueTogether( name="transformtransform", unique_together={("successor", "predecessor")}, ), migrations.AlterUniqueTogether( name="transformreference", unique_together={("transform", "reference")}, ), migrations.AlterUniqueTogether( name="transformrecord", unique_together={("transform", "record", "feature")}, ), migrations.AlterUniqueTogether( name="transformproject", unique_together={("transform", "project")}, ), migrations.AlterUniqueTogether( name="transform", unique_together={("key", "hash")}, ), migrations.AddConstraint( model_name="space", constraint=models.UniqueConstraint( django.db.models.functions.text.Lower("name"), name="unique_space_name_lower", ), ), migrations.AlterUniqueTogether( name="schemaproject", unique_together={("schema", "project")}, ), migrations.AlterUniqueTogether( name="schemacomponent", unique_together={("composite", "slot"), ("composite", "slot", "component")}, ), migrations.AlterUniqueTogether( name="runulabel", unique_together={("run", "ulabel")}, ), migrations.AlterUniqueTogether( name="runrecord", unique_together={("run", "record", "feature")}, ), migrations.AlterUniqueTogether( name="runproject", unique_together={("run", "project")}, ), migrations.AlterUniqueTogether( name="runjsonvalue", unique_together={("run", "jsonvalue")}, ), migrations.AlterUniqueTogether( name="referencerecord", unique_together={("reference", "feature", "record")}, ), migrations.AlterUniqueTogether( name="recorduser", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="projectrecord", unique_together={("project", "feature", "record")}, ), migrations.AlterUniqueTogether( name="jsonvalue", unique_together={("feature", "hash")}, ), migrations.AlterUniqueTogether( name="featureproject", unique_together={("feature", "project")}, ), migrations.AddConstraint( model_name="feature", constraint=models.CheckConstraint( condition=models.Q( ("is_type", True), ("_dtype_str__isnull", False), _connector="OR" ), name="feature_dtype_str_not_null_when_is_type_false", ), ), migrations.AlterUniqueTogether( name="collectionulabel", unique_together={("collection", "ulabel")}, ), migrations.AlterUniqueTogether( name="collectionreference", unique_together={("collection", "reference")}, ), migrations.AlterUniqueTogether( name="collectionrecord", unique_together={("collection", "record", "feature")}, ), migrations.AlterUniqueTogether( name="collectionproject", unique_together={("collection", "project")}, ), migrations.AlterUniqueTogether( name="collectionartifact", unique_together={("collection", "artifact")}, ), migrations.AddConstraint( model_name="collection", constraint=models.UniqueConstraint( fields=("key", "hash"), name="unique_collection_key_hash_not_null" ), ), migrations.AddConstraint( model_name="branch", constraint=models.UniqueConstraint( django.db.models.functions.text.Lower("name"), name="unique_branch_name_lower", ), ), migrations.AlterUniqueTogether( name="blockproject", unique_together={("block", "project")}, ), migrations.AlterUniqueTogether( name="artifactuser", unique_together={("artifact", "user", "feature")}, ), migrations.AlterUniqueTogether( name="artifactulabel", unique_together={("artifact", "ulabel", "feature")}, ), migrations.AlterUniqueTogether( name="artifactschema", unique_together={("artifact", "schema"), ("artifact", "slot")}, ), 
migrations.AlterUniqueTogether( name="artifactrun", unique_together={("artifact", "run", "feature")}, ), migrations.AlterUniqueTogether( name="artifactreference", unique_together={("artifact", "reference", "feature")}, ), migrations.AlterUniqueTogether( name="artifactrecord", unique_together={("artifact", "record", "feature")}, ), migrations.AlterUniqueTogether( name="artifactproject", unique_together={("artifact", "project", "feature")}, ), migrations.AlterUniqueTogether( name="artifactjsonvalue", unique_together={("artifact", "jsonvalue")}, ), migrations.AlterUniqueTogether( name="artifactartifact", unique_together={("artifact", "value", "feature")}, ), migrations.AddConstraint( model_name="artifact", constraint=models.UniqueConstraint( condition=models.Q(("key__isnull", False)), fields=("storage", "key", "hash"), name="unique_artifact_storage_key_hash_not_null", ), ), migrations.AddConstraint( model_name="artifact", constraint=models.UniqueConstraint( condition=models.Q(("key__isnull", True)), fields=("storage", "hash"), name="unique_artifact_storage_hash_null_key", ), ), migrations.RunPython(apply_constraints), ] if connection.vendor == "postgresql": Migration.operations += [ pgtrigger.migrations.AddTrigger( model_name="ulabel", trigger=pgtrigger.compiler.Trigger( name="prevent_ulabel_type_cycle", sql=pgtrigger.compiler.UpsertTriggerSql( condition="WHEN (NEW.type_id IS NOT NULL)", func="\n -- Check for direct self-reference\n IF NEW.type_id = NEW.id THEN\n RAISE EXCEPTION 'Cannot set type: ulabel cannot be its own type';\n END IF;\n\n -- Check for cycles in the type chain\n IF EXISTS (\n WITH RECURSIVE type_chain AS (\n SELECT type_id, 1 as depth\n FROM lamindb_ulabel\n WHERE id = NEW.type_id\n\n UNION ALL\n\n SELECT r.type_id, tc.depth + 1\n FROM lamindb_ulabel r\n INNER JOIN type_chain tc ON r.id = tc.type_id\n WHERE tc.depth < 100\n )\n SELECT 1 FROM type_chain WHERE type_id = NEW.id\n ) THEN\n RAISE EXCEPTION 'Cannot set type: would create a cycle';\n END IF;\n\n RETURN NEW;\n ", hash="53487a8e36a64748418457f7229de6d5cf31e6bd", operation="UPDATE OR INSERT", pgid="pgtrigger_prevent_ulabel_type_cycle_863ae", table="lamindb_ulabel", when="BEFORE", ), ), ), pgtrigger.migrations.AddTrigger( model_name="record", trigger=pgtrigger.compiler.Trigger( name="prevent_record_type_cycle", sql=pgtrigger.compiler.UpsertTriggerSql( condition="WHEN (NEW.type_id IS NOT NULL)", func="\n -- Check for direct self-reference\n IF NEW.type_id = NEW.id THEN\n RAISE EXCEPTION 'Cannot set type: record cannot be its own type';\n END IF;\n\n -- Check for cycles in the type chain\n IF EXISTS (\n WITH RECURSIVE type_chain AS (\n SELECT type_id, 1 as depth\n FROM lamindb_record\n WHERE id = NEW.type_id\n\n UNION ALL\n\n SELECT r.type_id, tc.depth + 1\n FROM lamindb_record r\n INNER JOIN type_chain tc ON r.id = tc.type_id\n WHERE tc.depth < 100\n )\n SELECT 1 FROM type_chain WHERE type_id = NEW.id\n ) THEN\n RAISE EXCEPTION 'Cannot set type: would create a cycle';\n END IF;\n\n RETURN NEW;\n ", hash="deaab832a066dfec76228f5b7a62a08f334876a9", operation="UPDATE OR INSERT", pgid="pgtrigger_prevent_record_type_cycle_56c18", table="lamindb_record", when="BEFORE", ), ), ), pgtrigger.migrations.AddTrigger( model_name="feature", trigger=pgtrigger.compiler.Trigger( name="update_feature_on_name_change", sql=pgtrigger.compiler.UpsertTriggerSql( condition="WHEN (OLD.name IS DISTINCT FROM NEW.name)", func="DECLARE\n old_renamed JSONB;\n new_renamed JSONB;\n ts TEXT;\nBEGIN\n -- Only proceed if name actually changed\n IF 
OLD.name IS DISTINCT FROM NEW.name THEN\n -- Update synonyms\n IF NEW.synonyms IS NULL OR NEW.synonyms = '' THEN\n NEW.synonyms := OLD.name;\n ELSIF position(OLD.name in NEW.synonyms) = 0 THEN\n NEW.synonyms := NEW.synonyms || '|' || OLD.name;\n END IF;\n\n -- Update _aux with rename history\n ts := TO_CHAR(NOW() AT TIME ZONE 'UTC', 'YYYY-MM-DD\"T\"HH24:MI:SS\"Z\"');\n\n -- Get existing renamed history or initialize empty object\n old_renamed := COALESCE((OLD._aux->>'renamed')::JSONB, '{}'::JSONB);\n\n -- Add old name with timestamp\n new_renamed := old_renamed || jsonb_build_object(ts, OLD.name);\n\n -- Update _aux with new renamed history\n IF NEW._aux IS NULL THEN\n NEW._aux := jsonb_build_object('renamed', new_renamed);\n ELSE\n NEW._aux := NEW._aux || jsonb_build_object('renamed', new_renamed);\n END IF;\n END IF;\n\n RETURN NEW;\nEND;\n", hash="5f2e7a65e42c34b0455f0840def52f078726e401", operation="UPDATE", pgid="pgtrigger_update_feature_on_name_change_6c32d", table="lamindb_feature", when="BEFORE", ), ), ), ] ================================================ FILE: lamindb/migrations/0178_v2_2.py ================================================ # Generated by Django 5.2 on 2026-02-15 11:25 import django.db.models.deletion from django.db import migrations, models import lamindb.base.fields import lamindb.models.sqlrecord class Migration(migrations.Migration): dependencies = [ ("lamindb", "0177_squashed"), ] operations = [ migrations.AddField( model_name="artifact", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="block", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="collection", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="feature", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="jsonvalue", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="project", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="record", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="reference", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="run", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="run", name="description", 
field=lamindb.base.fields.TextField(blank=True, default=None, null=True), ), migrations.AddField( model_name="run", name="plan", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="_plan_for_runs", to="lamindb.artifact", ), ), migrations.AddField( model_name="schema", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="storage", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="transform", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="transform", name="plan", field=models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="_plan_for_transforms", to="lamindb.artifact", ), ), migrations.AddField( model_name="ulabel", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="artifact", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="block", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="collection", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="feature", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="jsonvalue", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="project", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="record", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="reference", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="run", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="schema", name="branch", 
field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="storage", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="transform", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="ulabel", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.CreateModel( name="BranchPlan", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branchplan", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_branchplan", to="lamindb.branch", ), ), ], options={ "unique_together": {("branch", "artifact")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="branch", name="plans", field=models.ManyToManyField( related_name="_plan_for_branches", through="lamindb.BranchPlan", to="lamindb.artifact", ), ), migrations.CreateModel( name="BranchProject", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.branch", ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branch", to="lamindb.project", ), ), ], options={ "unique_together": {("branch", "project")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="branch", name="projects", field=models.ManyToManyField( related_name="branches", through="lamindb.BranchProject", to="lamindb.project", ), ), migrations.CreateModel( name="BranchULabel", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.branch", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branch", to="lamindb.ulabel", ), ), ], options={ "unique_together": {("branch", "ulabel")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="branch", name="ulabels", field=models.ManyToManyField( related_name="branches", through="lamindb.BranchULabel", to="lamindb.ulabel", ), ), migrations.CreateModel( name="BranchUser", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "role", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32 ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_user", to="lamindb.branch", ), ), ( "user", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branch", to="lamindb.user", ), 
), ], options={ "unique_together": {("branch", "user", "role")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="branch", name="users", field=models.ManyToManyField( related_name="branches", through="lamindb.BranchUser", to="lamindb.user" ), ), migrations.CreateModel( name="ProjectUser", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "role", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32 ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_user", to="lamindb.project", ), ), ( "user", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_project", to="lamindb.user", ), ), ], options={ "unique_together": {("project", "user", "role")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="project", name="users", field=models.ManyToManyField( related_name="projects", through="lamindb.ProjectUser", to="lamindb.user", ), ), migrations.CreateModel( name="RunArtifact", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_runartifact", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_runartifact", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_runartifact", to="lamindb.run", ), ), ], options={ "unique_together": {("run", "artifact", "feature")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="run", name="linked_artifacts", field=models.ManyToManyField( related_name="linked_by_runs", through="lamindb.RunArtifact", to="lamindb.artifact", ), ), ] ================================================ FILE: lamindb/migrations/0179_v2_2_part_2.py ================================================ # Generated by Django 5.2 on 2026-02-15 14:12 import django.db.models.deletion from django.db import migrations, models import lamindb.base.fields import lamindb.models.sqlrecord class Migration(migrations.Migration): dependencies = [ ("lamindb", "0178_v2_2"), ] operations = [ migrations.RemoveField( model_name="branch", name="plans", ), migrations.CreateModel( name="BranchArtifact", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branch", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_artifact", to="lamindb.branch", ), ), ], options={ "unique_together": {("branch", "artifact")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="branch", name="artifacts", field=models.ManyToManyField( related_name="linked_by_branches", through="lamindb.BranchArtifact", to="lamindb.artifact", ), ), migrations.DeleteModel( name="BranchPlan", ), ] ================================================ FILE: lamindb/migrations/0180_v2_2_part_3.py ================================================ # Generated by Django 5.2 on 2026-02-15 14:29 import django.db.models.deletion 
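# This follow-up migration alters the two RunArtifact foreign keys, renaming their
# reverse accessors: RunArtifact.artifact becomes reachable from Artifact as
# `links_in_run` and RunArtifact.run from Run as `values_artifact`.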
from django.db import migrations

import lamindb.base.fields


class Migration(migrations.Migration):
    dependencies = [
        ("lamindb", "0179_v2_2_part_2"),
    ]

    operations = [
        migrations.AlterField(
            model_name="runartifact",
            name="artifact",
            field=lamindb.base.fields.ForeignKey(blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_run", to="lamindb.artifact"),
        ),
        migrations.AlterField(
            model_name="runartifact",
            name="run",
            field=lamindb.base.fields.ForeignKey(blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_artifact", to="lamindb.run"),
        ),
    ]


================================================
FILE: lamindb/migrations/0181_v2_2_part_4.py
================================================
# Generated by Django 5.2 on 2026-02-15 15:43

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("lamindb", "0180_v2_2_part_3"),
    ]

    operations = [
        migrations.AddField(
            model_name="block",
            name="anchor",
            field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.PROTECT, related_name="children", to="lamindb.block"),
        ),
        migrations.AlterField(
            model_name="block",
            name="key",
            field=models.CharField(db_index=True, max_length=1024, null=True),
        ),
    ]


================================================
FILE: lamindb/migrations/0182_v2_2_part_5.py
================================================
# Generated by Django 5.2 on 2026-02-17 16:33

import django.db.models.deletion
from django.db import migrations, models

import lamindb.base.fields
import lamindb.base.users


class Migration(migrations.Migration):
    dependencies = [
        ("lamindb", "0181_v2_2_part_4"),
    ]

    operations = [
        migrations.AddField(
            model_name="branch",
            name="_status_code",
            field=models.SmallIntegerField(db_index=True, default=0),
        ),
        migrations.AlterField(
            model_name="artifactblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="artifactblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="block",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="block",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="branch",
            name="created_by",
            field=lamindb.base.fields.ForeignKey(blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="branchblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="branchblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="collectionblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="collectionblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="featureblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="featureblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="projectblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="projectblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="recordblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="recordblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="runblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="runblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="schemablock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="schemablock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="spaceblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="spaceblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="transformblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="transformblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="ulabelblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="ulabelblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AddField(
            model_name="artifactblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="block",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="branchblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="collectionblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="featureblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="projectblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="recordblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="runblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="schemablock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="spaceblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="transformblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="ulabelblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AlterField(
            model_name="branch",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AlterField(
            model_name="project",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AlterField(
            model_name="run",
            name="_status_code",
            field=models.SmallIntegerField(db_default=-3, db_index=True, default=-3),
        ),
    ]


================================================
FILE: lamindb/migrations/0183_v2_2_part_6.py
================================================
# Generated by Django 5.2 on 2026-02-17 23:04

from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        ("lamindb", "0182_v2_2_part_5"),
    ]

    operations = [
        migrations.RemoveField(
            model_name="branch",
            name="artifacts",
        ),
        migrations.DeleteModel(
            name="BranchArtifact",
        ),
    ]


================================================
FILE: lamindb/migrations/0184_alter_transformrecord_feature.py
================================================
# Generated by Django 5.2 on 2026-03-07 12:16

import django.db.models.deletion
from django.db import migrations

import lamindb.base.fields


class Migration(migrations.Migration):
    dependencies = [
        ("lamindb", "0183_v2_2_part_6"),
    ]

    operations = [
        migrations.AlterField(
            model_name="transformrecord",
            name="feature",
            field=lamindb.base.fields.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transformrecord", to="lamindb.feature"),
        ),
    ]


================================================
FILE: lamindb/migrations/0185_alter_runrecord_feature.py
================================================
# Generated by Django 5.2 on 2026-04-05 14:32

import django.db.models.deletion
from django.db import migrations

import lamindb.base.fields


class Migration(migrations.Migration):
    dependencies = [
        ("lamindb", "0184_alter_transformrecord_feature"),
    ]

    operations = [
        migrations.AlterField(
            model_name="runrecord",
            name="feature",
            field=lamindb.base.fields.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_runrecord", to="lamindb.feature"),
        ),
    ]


================================================
FILE:
lamindb/migrations/0186_v2_4.py ================================================ # Generated by Django 5.2 on 2026-04-12 18:49 import django.db.models.deletion from django.db import migrations, models import lamindb.base.fields class Migration(migrations.Migration): dependencies = [ ("lamindb", "0185_alter_runrecord_feature"), ] operations = [ migrations.AddField( model_name="artifactblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="artifactblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="branchblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="collectionblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="collectionblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="featureblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="featureblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="projectblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="projectblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="recordblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="recordblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="runblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="runblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="schemablock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="schemablock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="spaceblock", name="branch", field=models.ForeignKey( db_default=1, default=1, 
on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="spaceblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="transformblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="transformblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="ulabelblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="ulabelblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="block", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="block", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="branch", name="_aux", field=lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ] ================================================ FILE: lamindb/migrations/0187_squashed.py ================================================ # Generated by Django 5.2 on 2026-04-16 06:44 import django.core.validators import django.db.models.deletion import django.db.models.functions.datetime import django.db.models.functions.text import pgtrigger.compiler import pgtrigger.migrations from django.db import connection, migrations, models import lamindb.base.fields import lamindb.base.uids import lamindb.base.users import lamindb.models.can_curate import lamindb.models.has_parents import lamindb.models.run import lamindb.models.sqlrecord CREATE_IS_VALID_RECORD_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_record_type(record_type_id INTEGER, record_is_type BOOLEAN) RETURNS BOOLEAN AS $$ BEGIN -- Record with no type is valid IF record_type_id IS NULL THEN RETURN TRUE; END IF; -- If current record is a type, it can only reference schema-less types IF record_is_type THEN RETURN EXISTS ( SELECT 1 FROM lamindb_record r WHERE r.id = record_type_id AND r.is_type AND r.schema_id IS NULL ); END IF; -- Regular records can reference any type RETURN EXISTS ( SELECT 1 FROM lamindb_record r WHERE r.id = record_type_id AND r.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_RECORD_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_record ADD CONSTRAINT record_type_is_valid_fk CHECK (is_valid_record_type(type_id, is_type)); """ CREATE_IS_VALID_FEATURE_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_feature_type(feature_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN -- Feature with no type is valid IF feature_type_id IS NULL THEN RETURN TRUE; END IF; -- Type must have is_type = TRUE RETURN EXISTS ( SELECT 1 FROM lamindb_feature f WHERE f.id = feature_type_id AND f.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_FEATURE_TYPE_CONSTRAINT = """ ALTER TABLE 
lamindb_feature ADD CONSTRAINT feature_type_is_valid_fk CHECK (is_valid_feature_type(type_id)); """ CREATE_IS_VALID_SCHEMA_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_schema_type(schema_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF schema_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM lamindb_schema s WHERE s.id = schema_type_id AND s.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_SCHEMA_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_schema ADD CONSTRAINT schema_type_is_valid_fk CHECK (is_valid_schema_type(type_id)); """ CREATE_IS_VALID_PROJECT_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_project_type(project_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF project_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM lamindb_project p WHERE p.id = project_type_id AND p.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_PROJECT_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_project ADD CONSTRAINT project_type_is_valid_fk CHECK (is_valid_project_type(type_id)); """ CREATE_IS_VALID_REFERENCE_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_reference_type(reference_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF reference_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM lamindb_reference r WHERE r.id = reference_type_id AND r.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_REFERENCE_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_reference ADD CONSTRAINT reference_type_is_valid_fk CHECK (is_valid_reference_type(type_id)); """ CREATE_IS_VALID_ULABEL_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_ulabel_type(ulabel_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF ulabel_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM lamindb_ulabel u WHERE u.id = ulabel_type_id AND u.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_ULABEL_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_ulabel ADD CONSTRAINT ulabel_type_is_valid_fk CHECK (is_valid_ulabel_type(type_id)); """ def apply_constraints(apps, schema_editor): if schema_editor.connection.vendor == "postgresql": schema_editor.execute(CREATE_IS_VALID_RECORD_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_RECORD_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_FEATURE_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_FEATURE_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_SCHEMA_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_SCHEMA_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_PROJECT_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_PROJECT_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_REFERENCE_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_REFERENCE_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_ULABEL_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_ULABEL_TYPE_CONSTRAINT) class Migration(migrations.Migration): replaces = [ ("lamindb", "0177_squashed"), ("lamindb", "0177_alter_artifactblock_artifact_and_more"), ("lamindb", "0178_v2_2"), ("lamindb", "0179_v2_2_part_2"), ("lamindb", "0180_v2_2_part_3"), ("lamindb", "0181_v2_2_part_4"), ("lamindb", "0182_v2_2_part_5"), ("lamindb", "0183_v2_2_part_6"), ("lamindb", "0184_alter_transformrecord_feature"), ("lamindb", "0185_alter_runrecord_feature"), ("lamindb", "0186_v2_4"), ("lamindb", "0187_v2_4_part_2"), ] dependencies = [] # type: ignore operations = [ migrations.CreateModel( name="Migration", fields=[ ( "id", models.BigAutoField( auto_created=True, primary_key=True, serialize=False, verbose_name="ID", ), 
), ( "app", lamindb.base.fields.CharField( blank=True, default=None, max_length=255 ), ), ( "name", lamindb.base.fields.CharField( blank=True, default=None, max_length=255 ), ), ("applied", lamindb.base.fields.DateTimeField(blank=True)), ], options={ "db_table": "django_migrations", "managed": False, }, ), migrations.CreateModel( name="Branch", fields=[ ("id", models.AutoField(primary_key=True, serialize=False)), ("name", models.CharField(db_index=True, max_length=100)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ], ), migrations.CreateModel( name="Space", fields=[ ("id", models.SmallAutoField(primary_key=True, serialize=False)), ("name", models.CharField(db_index=True, max_length=100)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ], ), migrations.CreateModel( name="Artifact", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=20, unique=True, ), ), ( "key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=1024, null=True, ), ), ( "_real_key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=1024, null=True, ), ), ( "description", lamindb.base.fields.TextField( blank=True, db_index=True, default=None, null=True ), ), ( "suffix", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=30, ), ), ( "kind", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=20, null=True, ), ), ( "otype", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=64, null=True, ), ), ( "size", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, editable=False, null=True, ), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=22, 
null=True, ), ), ( "n_files", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, editable=False, null=True, ), ), ( "n_observations", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, editable=False, null=True, ), ), ( "_hash_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=30, null=True, ), ), ( "_key_is_virtual", lamindb.base.fields.BooleanField(blank=True, default=None), ), ( "_overwrite_versions", lamindb.base.fields.BooleanField(blank=True, default=None), ), ( "_actions", models.ManyToManyField( related_name="_action_targets", to="lamindb.artifact" ), ), ], options={ "abstract": False, }, ), migrations.CreateModel( name="ArtifactArtifact", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_artifact", to="lamindb.artifact", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_value", to="lamindb.artifact", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifact", name="artifacts", field=models.ManyToManyField( related_name="linked_by_artifacts", through="lamindb.ArtifactArtifact", to="lamindb.artifact", ), ), migrations.CreateModel( name="Block", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ("key", models.CharField(db_index=True, max_length=1024, null=True)), ( "anchor", models.ForeignKey( null=True, on_delete=django.db.models.deletion.PROTECT, related_name="children", to="lamindb.block", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "space", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), ], ), migrations.CreateModel( name="BlockProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), 
("id", models.BigAutoField(primary_key=True, serialize=False)), ( "block", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.block", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifact", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="artifact", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.CreateModel( name="Collection", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_20, editable=False, max_length=20, unique=True, ), ), ( "key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=22, null=True, ), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "_actions", models.ManyToManyField(related_name="+", to="lamindb.artifact"), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "meta_artifact", lamindb.base.fields.OneToOneField( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="_meta_of_collection", to="lamindb.artifact", ), ), ], options={ "abstract": False, }, ), migrations.CreateModel( name="CollectionArtifact", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.artifact", ), ), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_artifact", to="lamindb.collection", ), ), ], 
bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="collection", name="artifacts", field=models.ManyToManyField( related_name="collections", through="lamindb.CollectionArtifact", to="lamindb.artifact", ), ), migrations.CreateModel( name="CollectionProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.collection", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="CollectionReference", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.collection", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="Feature", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150 ), ), ( "_dtype_str", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "unit", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ("array_rank", models.SmallIntegerField(db_index=True, default=0)), ("array_size", models.IntegerField(db_index=True, default=0)), ( "array_shape", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "synonyms", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "default_value", lamindb.base.fields.JSONField(blank=True, default=None, null=True), ), ( "nullable", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "coerce", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "type", lamindb.base.fields.ForeignKey( 
blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="features", to="lamindb.feature", ), ), ], options={ "abstract": False, }, bases=(lamindb.models.can_curate.CanCurate, models.Model), ), migrations.CreateModel( name="CollectionRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.collection", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collectionrecord", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactRun", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_run", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactrun", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactReference", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactreference", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactrecord", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactproject", 
to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifactartifact", name="feature", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactartifact", to="lamindb.feature", ), ), migrations.CreateModel( name="FeatureProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="JsonValue", fields=[ ( "id", models.BigAutoField( auto_created=True, primary_key=True, serialize=False, verbose_name="ID", ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("value", models.JSONField()), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=22, null=True, ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="values", to="lamindb.feature", ), ), ], options={ "abstract": False, "base_manager_name": "objects", }, ), migrations.CreateModel( name="ArtifactJsonValue", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_jsonvalue", to="lamindb.artifact", ), ), ( "jsonvalue", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.jsonvalue", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifact", name="json_values", field=models.ManyToManyField( related_name="artifacts", through="lamindb.ArtifactJsonValue", to="lamindb.jsonvalue", ), ), migrations.CreateModel( name="Project", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, 
db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "abbr", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32, null=True, ), ), ( "url", lamindb.base.fields.URLField( blank=True, default=None, max_length=255, null=True ), ), ( "start_date", lamindb.base.fields.DateField(blank=True, default=None, null=True), ), ( "end_date", lamindb.base.fields.DateField(blank=True, default=None, null=True), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ( "artifacts", models.ManyToManyField( related_name="projects", through="lamindb.ArtifactProject", to="lamindb.artifact", ), ), ( "blocks", models.ManyToManyField( related_name="projects", through="lamindb.BlockProject", to="lamindb.block", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "collections", models.ManyToManyField( related_name="projects", through="lamindb.CollectionProject", to="lamindb.collection", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "features", models.ManyToManyField( related_name="projects", through="lamindb.FeatureProject", to="lamindb.feature", ), ), ( "parents", models.ManyToManyField( related_name="children", to="lamindb.project" ), ), ( "predecessors", models.ManyToManyField( related_name="successors", to="lamindb.project" ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="projects", to="lamindb.project", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.can_curate.CanCurate, models.Model, lamindb.models.sqlrecord.ValidateFields, ), ), migrations.AddField( model_name="featureproject", name="project", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_feature", to="lamindb.project", ), ), migrations.AddField( model_name="collectionproject", name="project", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.project", ), ), migrations.CreateModel( name="BranchProject", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.branch", ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branch", to="lamindb.project", ), ), ], options={ "unique_together": {("branch", "project")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="branch", name="projects", field=models.ManyToManyField( related_name="branches", through="lamindb.BranchProject", to="lamindb.project", ), ), migrations.AddField( model_name="blockproject", name="project", 
field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_block", to="lamindb.project", ), ), migrations.AddField( model_name="artifactproject", name="project", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.project", ), ), migrations.CreateModel( name="Record", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=16, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ("extra_data", models.JSONField(null=True)), ( "artifacts", models.ManyToManyField( related_name="records", through="lamindb.ArtifactRecord", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "collections", models.ManyToManyField( related_name="records", through="lamindb.CollectionRecord", to="lamindb.collection", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "parents", models.ManyToManyField( related_name="children", to="lamindb.record" ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="records", to="lamindb.record", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.has_parents.HasParents, lamindb.models.can_curate.CanCurate, models.Model, ), ), migrations.CreateModel( name="ProjectRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_projectrecord", to="lamindb.feature", ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.project", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.record", ), ), ], 
bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="project", name="records", field=models.ManyToManyField( related_name="projects", through="lamindb.ProjectRecord", to="lamindb.record", ), ), migrations.AddField( model_name="collectionrecord", name="record", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.record", ), ), migrations.AddField( model_name="artifactrecord", name="record", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.record", ), ), migrations.CreateModel( name="RecordArtifact", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordartifact", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_artifact", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.artifact", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="artifact", name="linked_in_records", field=models.ManyToManyField( related_name="linked_artifacts", through="lamindb.RecordArtifact", to="lamindb.record", ), ), migrations.CreateModel( name="RecordCollection", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordcollection", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_collection", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.collection", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="collection", name="linked_in_records", field=models.ManyToManyField( related_name="linked_collections", through="lamindb.RecordCollection", to="lamindb.record", ), ), migrations.CreateModel( name="RecordProject", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordproject", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_project", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.project", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="project", name="linked_in_records", field=models.ManyToManyField( related_name="linked_projects", through="lamindb.RecordProject", to="lamindb.record", ), ), migrations.CreateModel( name="RecordRecord", fields=[ ("id", 
models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordrecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_record", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.record", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="record", name="linked_records", field=models.ManyToManyField( related_name="linked_in_records", through="lamindb.RecordRecord", to="lamindb.record", ), ), migrations.CreateModel( name="RecordReference", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordreference", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_reference", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordRun", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordrun", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_run", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordTransform", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordtransform", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_transform", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordULabel", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordulabel", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_ulabel", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordUser", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recorduser", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_user", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="Reference", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", 
lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "abbr", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32, null=True, ), ), ( "url", lamindb.base.fields.URLField(blank=True, db_index=True, null=True), ), ( "pubmed_id", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, null=True ), ), ( "doi", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, validators=[ django.core.validators.RegexValidator( message="Must be a DOI (e.g., 10.1000/xyz123 or https://doi.org/10.1000/xyz123)", regex="^(?:https?://(?:dx\\.)?doi\\.org/|doi:|DOI:)?10\\.\\d+/.*$", ) ], ), ), ( "text", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "date", lamindb.base.fields.DateField(blank=True, default=None, null=True), ), ( "artifacts", models.ManyToManyField( related_name="references", through="lamindb.ArtifactReference", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "collections", models.ManyToManyField( related_name="references", through="lamindb.CollectionReference", to="lamindb.collection", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_references", through="lamindb.RecordReference", to="lamindb.record", ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="references", to="lamindb.reference", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.can_curate.CanCurate, models.Model, lamindb.models.sqlrecord.ValidateFields, ), ), migrations.AddField( model_name="recordreference", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.reference", ), ), migrations.AddField( model_name="project", name="references", field=models.ManyToManyField( related_name="projects", to="lamindb.reference" ), ), migrations.AddField( model_name="collectionreference", name="reference", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.reference", ), ), migrations.AddField( model_name="artifactreference", name="reference", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", 
to="lamindb.reference", ), ), migrations.CreateModel( name="ReferenceRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_referencerecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.record", ), ), ( "reference", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.reference", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="reference", name="records", field=models.ManyToManyField( related_name="references", through="lamindb.ReferenceRecord", to="lamindb.record", ), ), migrations.CreateModel( name="Run", fields=[ ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "entrypoint", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "started_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "finished_at", lamindb.base.fields.DateTimeField( blank=True, db_index=True, default=None, null=True ), ), ("params", models.JSONField(null=True)), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "cli_args", lamindb.base.fields.CharField( blank=True, default=None, max_length=1024, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_is_consecutive", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "_status_code", models.SmallIntegerField(db_default=-3, db_index=True, default=-3), ), ( "artifacts", models.ManyToManyField( related_name="runs", through="lamindb.ArtifactRun", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "environment", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, 
on_delete=django.db.models.deletion.PROTECT, related_name="_environment_of", to="lamindb.artifact", ), ), ( "initiated_by_run", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="initiated_runs", to="lamindb.run", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_runs", through="lamindb.RecordRun", to="lamindb.record", ), ), ( "plan", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="_plan_for_runs", to="lamindb.artifact", ), ), ( "report", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="_report_of", to="lamindb.artifact", ), ), ], ), migrations.AddField( model_name="referencerecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="reference", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="recordrun", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.run", ), ), migrations.AddField( model_name="record", name="input_of_runs", field=models.ManyToManyField( related_name="input_records", to="lamindb.run" ), ), migrations.AddField( model_name="record", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, editable=False, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="output_records", to="lamindb.run", ), ), migrations.AddField( model_name="projectrecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="project", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="jsonvalue", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="featureproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="feature", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionreference", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionrecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, 
default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionartifact", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collection", name="input_of_runs", field=models.ManyToManyField( related_name="input_collections", to="lamindb.run" ), ), migrations.AddField( model_name="collection", name="recreating_runs", field=models.ManyToManyField( related_name="recreated_collections", to="lamindb.run" ), ), migrations.AddField( model_name="collection", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="output_collections", to="lamindb.run", ), ), migrations.AddField( model_name="blockproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.CreateModel( name="ArtifactUser", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_user", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactuser", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactULabel", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactulabel", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifactrun", name="run", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_artifact", to="lamindb.run", ), ), migrations.AddField( model_name="artifactreference", name="run", field=lamindb.base.fields.ForeignKey( blank=True, 
default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactrecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactjsonvalue", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactartifact", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifact", name="input_of_runs", field=models.ManyToManyField( related_name="input_artifacts", to="lamindb.run" ), ), migrations.AddField( model_name="artifact", name="recreating_runs", field=models.ManyToManyField( related_name="recreated_artifacts", to="lamindb.run" ), ), migrations.AddField( model_name="artifact", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=None, editable=False, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="output_artifacts", to="lamindb.run", ), ), migrations.CreateModel( name="RunArtifact", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_run", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_runartifact", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_artifact", to="lamindb.run", ), ), ], options={ "unique_together": {("run", "artifact", "feature")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="run", name="linked_artifacts", field=models.ManyToManyField( related_name="linked_by_runs", through="lamindb.RunArtifact", to="lamindb.artifact", ), ), migrations.CreateModel( name="RunJsonValue", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "jsonvalue", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.jsonvalue", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_jsonvalue", to="lamindb.run", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="run", name="json_values", field=models.ManyToManyField( related_name="runs", through="lamindb.RunJsonValue", to="lamindb.jsonvalue", ), ), migrations.CreateModel( name="RunProject", fields=[ ("id", models.BigAutoField(primary_key=True, 
serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.run", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="project", name="runs", field=models.ManyToManyField( related_name="projects", through="lamindb.RunProject", to="lamindb.run" ), ), migrations.CreateModel( name="RunRecord", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_runrecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.record", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.run", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="record", name="runs", field=models.ManyToManyField( related_name="records", through="lamindb.RunRecord", to="lamindb.run" ), ), migrations.CreateModel( name="Schema", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=16, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "n_members", lamindb.base.fields.IntegerField( blank=True, default=None, null=True ), ), ( "coerce", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "flexible", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "itype", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=120, null=True, ), ), ( "otype", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=64, null=True, ), ), ( "_dtype_str", lamindb.base.fields.CharField( blank=True, default=None, editable=False, max_length=64, null=True, ), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=22, null=True, ), ), ( "minimal_set", lamindb.base.fields.BooleanField( 
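# minimal_set / ordered_set / maximal_set flag the set semantics of a Schema's
# members (presumably: members are required, member order matters, and no members
# beyond the set are allowed, respectively).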
blank=True, db_index=True, default=True, editable=False ), ), ( "ordered_set", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=False, editable=False ), ), ( "maximal_set", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=False, editable=False ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="schemas", to="lamindb.schema", ), ), ], options={ "abstract": False, }, bases=(lamindb.models.can_curate.CanCurate, models.Model), ), migrations.AddField( model_name="record", name="schema", field=lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="records", to="lamindb.schema", ), ), migrations.CreateModel( name="ArtifactSchema", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "slot", lamindb.base.fields.CharField( blank=True, default=None, max_length=255, null=True ), ), ( "feature_ref_is_semantic", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="_links_schema", to="lamindb.artifact", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "schema", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="_links_artifact", to="lamindb.schema", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifact", name="schema", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="validated_artifacts", to="lamindb.schema", ), ), migrations.AddField( model_name="artifact", name="schemas", field=models.ManyToManyField( related_name="artifacts", through="lamindb.ArtifactSchema", to="lamindb.schema", ), ), migrations.CreateModel( name="SchemaComponent", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "slot", lamindb.base.fields.CharField( blank=True, default=None, max_length=255, null=True ), ), ( "component", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_composite", to="lamindb.schema", ), ), ( "composite", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_component", to="lamindb.schema", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, 
default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="schema", name="components", field=models.ManyToManyField( related_name="composites", through="lamindb.SchemaComponent", to="lamindb.schema", ), ), migrations.CreateModel( name="SchemaFeature", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_schema", to="lamindb.feature", ), ), ( "schema", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_feature", to="lamindb.schema", ), ), ], options={ "unique_together": {("schema", "feature")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="feature", name="schemas", field=models.ManyToManyField( related_name="features", through="lamindb.SchemaFeature", to="lamindb.schema", ), ), migrations.CreateModel( name="SchemaProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_schema", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "schema", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.schema", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="project", name="schemas", field=models.ManyToManyField( related_name="projects", through="lamindb.SchemaProject", to="lamindb.schema", ), ), migrations.AddField( model_name="schema", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="run", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="reference", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="record", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="project", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="jsonvalue", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="feature", name="space", field=lamindb.base.fields.ForeignKey( blank=True, 
db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="collection", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="branch", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="artifact", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.CreateModel( name="Storage", fields=[ ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "root", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, unique=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30 ), ), ( "region", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=64, null=True, ), ), ( "instance_uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=12, null=True, ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "space", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), ], options={ "abstract": False, }, ), migrations.AddField( model_name="artifact", name="storage", field=lamindb.base.fields.ForeignKey( blank=True, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="artifacts", to="lamindb.storage", ), ), migrations.CreateModel( name="Transform", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ("id", 
models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=16, unique=True, ), ), ( "key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=1024 ), ), ( "description", lamindb.base.fields.TextField( blank=True, db_index=True, default=None, null=True ), ), ( "kind", lamindb.base.fields.CharField( blank=True, db_index=True, default="pipeline", max_length=20 ), ), ( "source_code", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=22, null=True, ), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "environment", models.ForeignKey( null=True, on_delete=django.db.models.deletion.CASCADE, related_name="_environment_of_transforms", to="lamindb.artifact", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_transforms", through="lamindb.RecordTransform", to="lamindb.record", ), ), ( "plan", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="_plan_for_transforms", to="lamindb.artifact", ), ), ( "space", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), ], options={ "abstract": False, }, ), migrations.AddField( model_name="run", name="transform", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="runs", to="lamindb.transform", ), ), migrations.AddField( model_name="recordtransform", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.transform", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, 
models.Model), ), migrations.AddField( model_name="project", name="transforms", field=models.ManyToManyField( related_name="projects", through="lamindb.TransformProject", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformRecord", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), editable=False, ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transformrecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.record", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.transform", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="record", name="transforms", field=models.ManyToManyField( related_name="records", through="lamindb.TransformRecord", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformReference", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "reference", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.reference", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.transform", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="reference", name="transforms", field=models.ManyToManyField( related_name="references", through="lamindb.TransformReference", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformTransform", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ("config", models.JSONField(default=None, null=True)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), editable=False, ), ), ( "predecessor", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_successor", to="lamindb.transform", ), ), ( "successor", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_predecessor", to="lamindb.transform", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="transform", name="predecessors", field=models.ManyToManyField( related_name="successors", through="lamindb.TransformTransform", to="lamindb.transform", ), ), migrations.CreateModel( name="ULabel", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", 
lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_8, editable=False, max_length=8, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "artifacts", models.ManyToManyField( related_name="ulabels", through="lamindb.ArtifactULabel", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_ulabels", through="lamindb.RecordULabel", to="lamindb.record", ), ), ( "parents", models.ManyToManyField( related_name="children", to="lamindb.ulabel" ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "space", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="ulabels", to="lamindb.ulabel", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.has_parents.HasParents, lamindb.models.can_curate.CanCurate, models.Model, ), ), migrations.CreateModel( name="TransformULabel", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.transform", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.ulabel", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="transform", name="ulabels", field=models.ManyToManyField( related_name="transforms", through="lamindb.TransformULabel", to="lamindb.ulabel", ), ), migrations.CreateModel( 
name="RunULabel", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.run", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.ulabel", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="run", name="ulabels", field=models.ManyToManyField( related_name="runs", through="lamindb.RunULabel", to="lamindb.ulabel" ), ), migrations.AddField( model_name="recordulabel", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.ulabel", ), ), migrations.CreateModel( name="CollectionULabel", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.collection", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collectionulabel", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.ulabel", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="collection", name="ulabels", field=models.ManyToManyField( related_name="collections", through="lamindb.CollectionULabel", to="lamindb.ulabel", ), ), migrations.CreateModel( name="BranchULabel", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.branch", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branch", to="lamindb.ulabel", ), ), ], options={ "unique_together": {("branch", "ulabel")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="branch", name="ulabels", field=models.ManyToManyField( related_name="branches", through="lamindb.BranchULabel", to="lamindb.ulabel", ), ), migrations.AddField( model_name="artifactulabel", name="ulabel", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.ulabel", ), ), migrations.CreateModel( name="ULabelProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, 
related_name="links_ulabel", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.ulabel", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="project", name="ulabels", field=models.ManyToManyField( related_name="projects", through="lamindb.ULabelProject", to="lamindb.ulabel", ), ), migrations.CreateModel( name="User", fields=[ ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=8, unique=True, ), ), ( "handle", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "artifacts", models.ManyToManyField( related_name="users", through="lamindb.ArtifactUser", to="lamindb.artifact", through_fields=("user", "artifact"), ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_users", through="lamindb.RecordUser", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.can_curate.CanCurate), ), migrations.AddField( model_name="ulabelproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="ULabelBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "ulabel", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.ulabel", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, 
related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="ulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformtransform", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformreference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="TransformBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ("line_number", models.IntegerField(null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "transform", models.ForeignKey( null=True, on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.transform", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="transform", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="created_transforms", to="lamindb.user", ), ), migrations.AddField( model_name="storage", 
name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="SpaceBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "space", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.space", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="space", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="schemaproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="schemacomponent", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="SchemaBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, 
on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "schema", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.schema", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="schema", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="runulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="runrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="runproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="runjsonvalue", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="RunBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "run", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.run", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="run", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.CASCADE, 
related_name="created_runs", to="lamindb.user", ), ), migrations.AddField( model_name="referencerecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="reference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="recorduser", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.user", ), ), migrations.CreateModel( name="RecordBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "record", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.record", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="record", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="ProjectUser", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "role", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32 ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_user", to="lamindb.project", ), ), ( "user", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_project", to="lamindb.user", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="projectrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="ProjectBlock", fields=[ ( "version_tag", 
lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "project", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.project", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="project", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="project", name="users", field=models.ManyToManyField( related_name="projects", through="lamindb.ProjectUser", to="lamindb.user", ), ), migrations.AddField( model_name="jsonvalue", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="featureproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="FeatureBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", 
models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "feature", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.feature", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="feature", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionreference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="CollectionBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "collection", models.ForeignKey( null=True, on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.collection", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="collectionartifact", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, 
on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collection", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="BranchUser", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "role", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32 ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_user", to="lamindb.branch", ), ), ( "user", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branch", to="lamindb.user", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="BranchBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.branch", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="branch", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="branch", name="users", field=models.ManyToManyField( related_name="branches", through="lamindb.BranchUser", to="lamindb.user" ), ), migrations.AddField( model_name="blockproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="block", name="created_by", field=models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactuser", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactuser", name="user", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.user", ), 
), migrations.AddField( model_name="artifactulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactschema", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactrun", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactreference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactjsonvalue", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="ArtifactBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "artifact", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.artifact", ), ), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="artifactartifact", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, 
default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifact", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="created_artifacts", to="lamindb.user", ), ), migrations.CreateModel( name="RecordJson", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "value", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordjson", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_json", to="lamindb.record", ), ), ], options={ "unique_together": {("record", "feature")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AlterUniqueTogether( name="recordreference", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="recordrun", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="recordtransform", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="recordulabel", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="ulabelproject", unique_together={("ulabel", "project")}, ), migrations.AlterUniqueTogether( name="transformulabel", unique_together={("transform", "ulabel")}, ), migrations.AlterUniqueTogether( name="transformtransform", unique_together={("successor", "predecessor")}, ), migrations.AlterUniqueTogether( name="transformreference", unique_together={("transform", "reference")}, ), migrations.AlterUniqueTogether( name="transformrecord", unique_together={("transform", "record", "feature")}, ), migrations.AlterUniqueTogether( name="transformproject", unique_together={("transform", "project")}, ), migrations.AlterUniqueTogether( name="transform", unique_together={("key", "hash")}, ), migrations.AddConstraint( model_name="space", constraint=models.UniqueConstraint( django.db.models.functions.text.Lower("name"), name="unique_space_name_lower", ), ), migrations.AlterUniqueTogether( name="schemaproject", unique_together={("schema", "project")}, ), migrations.AlterUniqueTogether( name="schemacomponent", unique_together={("composite", "slot"), ("composite", "slot", "component")}, ), migrations.AlterUniqueTogether( name="runulabel", unique_together={("run", "ulabel")}, ), migrations.AlterUniqueTogether( name="runrecord", unique_together={("run", "record", "feature")}, ), migrations.AlterUniqueTogether( name="runproject", unique_together={("run", "project")}, ), migrations.AlterUniqueTogether( name="runjsonvalue", unique_together={("run", "jsonvalue")}, ), migrations.AlterUniqueTogether( name="referencerecord", unique_together={("reference", "feature", "record")}, ), migrations.AlterUniqueTogether( name="recorduser", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="projectuser", unique_together={("project", "user", "role")}, ), migrations.AlterUniqueTogether( name="projectrecord", unique_together={("project", "feature", "record")}, ), migrations.AlterUniqueTogether( name="jsonvalue", unique_together={("feature", "hash")}, ), migrations.AlterUniqueTogether( 
name="featureproject", unique_together={("feature", "project")}, ), migrations.AddConstraint( model_name="feature", constraint=models.CheckConstraint( condition=models.Q( ("is_type", True), ("_dtype_str__isnull", False), _connector="OR" ), name="feature_dtype_str_not_null_when_is_type_false", ), ), migrations.AlterUniqueTogether( name="collectionulabel", unique_together={("collection", "ulabel")}, ), migrations.AlterUniqueTogether( name="collectionreference", unique_together={("collection", "reference")}, ), migrations.AlterUniqueTogether( name="collectionrecord", unique_together={("collection", "record", "feature")}, ), migrations.AlterUniqueTogether( name="collectionproject", unique_together={("collection", "project")}, ), migrations.AlterUniqueTogether( name="collectionartifact", unique_together={("collection", "artifact")}, ), migrations.AddConstraint( model_name="collection", constraint=models.UniqueConstraint( fields=("key", "hash"), name="unique_collection_key_hash_not_null" ), ), migrations.AlterUniqueTogether( name="branchuser", unique_together={("branch", "user", "role")}, ), migrations.AddConstraint( model_name="branch", constraint=models.UniqueConstraint( django.db.models.functions.text.Lower("name"), name="unique_branch_name_lower", ), ), migrations.AlterUniqueTogether( name="blockproject", unique_together={("block", "project")}, ), migrations.AlterUniqueTogether( name="artifactuser", unique_together={("artifact", "user", "feature")}, ), migrations.AlterUniqueTogether( name="artifactulabel", unique_together={("artifact", "ulabel", "feature")}, ), migrations.AlterUniqueTogether( name="artifactschema", unique_together={("artifact", "schema"), ("artifact", "slot")}, ), migrations.AlterUniqueTogether( name="artifactrun", unique_together={("artifact", "run", "feature")}, ), migrations.AlterUniqueTogether( name="artifactreference", unique_together={("artifact", "reference", "feature")}, ), migrations.AlterUniqueTogether( name="artifactrecord", unique_together={("artifact", "record", "feature")}, ), migrations.AlterUniqueTogether( name="artifactproject", unique_together={("artifact", "project", "feature")}, ), migrations.AlterUniqueTogether( name="artifactjsonvalue", unique_together={("artifact", "jsonvalue")}, ), migrations.AlterUniqueTogether( name="artifactartifact", unique_together={("artifact", "value", "feature")}, ), migrations.AddConstraint( model_name="artifact", constraint=models.UniqueConstraint( condition=models.Q(("key__isnull", False)), fields=("storage", "key", "hash"), name="unique_artifact_storage_key_hash_not_null", ), ), migrations.AddConstraint( model_name="artifact", constraint=models.UniqueConstraint( condition=models.Q(("key__isnull", True)), fields=("storage", "hash"), name="unique_artifact_storage_hash_null_key", ), ), migrations.RunPython(apply_constraints), ] if connection.vendor == "postgresql": Migration.operations += [ pgtrigger.migrations.AddTrigger( model_name="ulabel", trigger=pgtrigger.compiler.Trigger( name="prevent_ulabel_type_cycle", sql=pgtrigger.compiler.UpsertTriggerSql( condition="WHEN (NEW.type_id IS NOT NULL)", func="\n -- Check for direct self-reference\n IF NEW.type_id = NEW.id THEN\n RAISE EXCEPTION 'Cannot set type: ulabel cannot be its own type';\n END IF;\n\n -- Check for cycles in the type chain\n IF EXISTS (\n WITH RECURSIVE type_chain AS (\n SELECT type_id, 1 as depth\n FROM lamindb_ulabel\n WHERE id = NEW.type_id\n\n UNION ALL\n\n SELECT r.type_id, tc.depth + 1\n FROM lamindb_ulabel r\n INNER JOIN type_chain tc ON r.id = 
tc.type_id\n WHERE tc.depth < 100\n )\n SELECT 1 FROM type_chain WHERE type_id = NEW.id\n ) THEN\n RAISE EXCEPTION 'Cannot set type: would create a cycle';\n END IF;\n\n RETURN NEW;\n ", hash="53487a8e36a64748418457f7229de6d5cf31e6bd", operation="UPDATE OR INSERT", pgid="pgtrigger_prevent_ulabel_type_cycle_863ae", table="lamindb_ulabel", when="BEFORE", ), ), ), pgtrigger.migrations.AddTrigger( model_name="record", trigger=pgtrigger.compiler.Trigger( name="prevent_record_type_cycle", sql=pgtrigger.compiler.UpsertTriggerSql( condition="WHEN (NEW.type_id IS NOT NULL)", func="\n -- Check for direct self-reference\n IF NEW.type_id = NEW.id THEN\n RAISE EXCEPTION 'Cannot set type: record cannot be its own type';\n END IF;\n\n -- Check for cycles in the type chain\n IF EXISTS (\n WITH RECURSIVE type_chain AS (\n SELECT type_id, 1 as depth\n FROM lamindb_record\n WHERE id = NEW.type_id\n\n UNION ALL\n\n SELECT r.type_id, tc.depth + 1\n FROM lamindb_record r\n INNER JOIN type_chain tc ON r.id = tc.type_id\n WHERE tc.depth < 100\n )\n SELECT 1 FROM type_chain WHERE type_id = NEW.id\n ) THEN\n RAISE EXCEPTION 'Cannot set type: would create a cycle';\n END IF;\n\n RETURN NEW;\n ", hash="deaab832a066dfec76228f5b7a62a08f334876a9", operation="UPDATE OR INSERT", pgid="pgtrigger_prevent_record_type_cycle_56c18", table="lamindb_record", when="BEFORE", ), ), ), pgtrigger.migrations.AddTrigger( model_name="feature", trigger=pgtrigger.compiler.Trigger( name="update_feature_on_name_change", sql=pgtrigger.compiler.UpsertTriggerSql( condition="WHEN (OLD.name IS DISTINCT FROM NEW.name)", func="DECLARE\n old_renamed JSONB;\n new_renamed JSONB;\n ts TEXT;\nBEGIN\n -- Only proceed if name actually changed\n IF OLD.name IS DISTINCT FROM NEW.name THEN\n -- Update synonyms\n IF NEW.synonyms IS NULL OR NEW.synonyms = '' THEN\n NEW.synonyms := OLD.name;\n ELSIF position(OLD.name in NEW.synonyms) = 0 THEN\n NEW.synonyms := NEW.synonyms || '|' || OLD.name;\n END IF;\n\n -- Update _aux with rename history\n ts := TO_CHAR(NOW() AT TIME ZONE 'UTC', 'YYYY-MM-DD\"T\"HH24:MI:SS\"Z\"');\n\n -- Get existing renamed history or initialize empty object\n old_renamed := COALESCE((OLD._aux->>'renamed')::JSONB, '{}'::JSONB);\n\n -- Add old name with timestamp\n new_renamed := old_renamed || jsonb_build_object(ts, OLD.name);\n\n -- Update _aux with new renamed history\n IF NEW._aux IS NULL THEN\n NEW._aux := jsonb_build_object('renamed', new_renamed);\n ELSE\n NEW._aux := NEW._aux || jsonb_build_object('renamed', new_renamed);\n END IF;\n END IF;\n\n RETURN NEW;\nEND;\n", hash="5f2e7a65e42c34b0455f0840def52f078726e401", operation="UPDATE", pgid="pgtrigger_update_feature_on_name_change_6c32d", table="lamindb_feature", when="BEFORE", ), ), ), ] ================================================ FILE: lamindb/migrations/0187_v2_4_part_2.py ================================================ # Generated by Django 5.2 on 2026-04-16 06:38 import django.db.models.deletion from django.db import migrations import lamindb.base.fields class Migration(migrations.Migration): dependencies = [ ("lamindb", "0186_v2_4"), ] operations = [ migrations.RemoveField( model_name="branchblock", name="created_on", ), migrations.AlterField( model_name="block", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="block", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, 
default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ] ================================================ FILE: lamindb/migrations/README.md ================================================ # Attention Remember that lamindb schema changes that do not work on old databases (like adding columns or tables) cannot be deployed to cloud functions unless these instances are migrated. ================================================ FILE: lamindb/migrations/__init__.py ================================================ ================================================ FILE: lamindb/models/__init__.py ================================================ """Auxiliary models & database library. Registry basics --------------- .. autoclass:: BaseSQLRecord .. autoclass:: SQLRecord .. autoclass:: Registry .. autoclass:: BasicQuerySet .. autoclass:: QuerySet Mixins for registries --------------------- .. autoclass:: IsVersioned .. autoclass:: HasType .. autoclass:: HasParents .. autoclass:: CanCurate .. autoclass:: TracksRun .. autoclass:: TracksUpdates Managers -------- .. autoclass:: FeatureManager .. autoclass:: LabelManager .. autoclass:: QueryManager .. autoclass:: RelatedManager Annotations of objects ---------------------- Artifact, run, collection, annotations can be conditioned on features. Besides linking categorical data, you can also link simple data types by virtue of the `JsonValue` model. .. autoclass:: JsonValue Annotating artifacts. .. autoclass:: ArtifactArtifact .. autoclass:: ArtifactJsonValue .. autoclass:: ArtifactProject .. autoclass:: ArtifactRecord .. autoclass:: ArtifactReference .. autoclass:: ArtifactRun .. autoclass:: ArtifactSchema .. autoclass:: ArtifactULabel .. autoclass:: ArtifactUser Annotating collections. .. autoclass:: CollectionArtifact .. autoclass:: CollectionProject .. autoclass:: CollectionReference .. autoclass:: CollectionULabel .. autoclass:: CollectionRecord Annotating runs. .. autoclass:: RunJsonValue .. autoclass:: RunProject .. autoclass:: RunULabel .. autoclass:: RunRecord Annotating transforms. .. autoclass:: TransformProject .. autoclass:: TransformReference .. autoclass:: TransformULabel Building relationships among transforms. .. autoclass:: TransformTransform Annotating features, blocks, and ulabels with projects. .. autoclass:: FeatureProject .. autoclass:: BlockProject .. autoclass:: ULabelProject .. autoclass:: SchemaProject .. autoclass:: ProjectRecord Building schemas. .. autoclass:: SchemaComponent .. autoclass:: SchemaFeature Annotating references with records. .. autoclass:: ReferenceRecord Record values ------------- Record values work almost exactly like artifact and run annotations, with the exception that JSON values are stored in `RecordJson` on a per-record basis and not in `JsonValue`. .. autoclass:: RecordArtifact .. autoclass:: RecordCollection .. autoclass:: RecordJson .. autoclass:: RecordProject .. autoclass:: RecordRecord .. autoclass:: RecordReference .. autoclass:: RecordRun .. autoclass:: RecordTransform .. autoclass:: RecordULabel .. autoclass:: RecordUser .. autoclass:: TransformRecord Blocks ------ .. autoclass:: BaseBlock .. autoclass:: Block .. autoclass:: ArtifactBlock .. autoclass:: BranchBlock .. autoclass:: CollectionBlock .. autoclass:: FeatureBlock .. autoclass:: ProjectBlock .. autoclass:: RecordBlock .. autoclass:: RunBlock .. autoclass:: SchemaBlock .. autoclass:: SpaceBlock .. autoclass:: TransformBlock .. autoclass:: ULabelBlock Utils ----- .. autoclass:: LazyArtifact .. 
autoclass:: InspectResult .. autoclass:: ValidateFields .. autoclass:: SchemaOptionals .. autoclass:: lamindb.models.query_set.BiontyDB .. autoclass:: lamindb.models.query_set.PertdbDB """ # ruff: noqa: I001 from lamin_utils._inspect import InspectResult from ._is_versioned import IsVersioned from .can_curate import CanCurate from .sqlrecord import ( BaseSQLRecord, SQLRecord, Registry, Space, Branch, Migration, ValidateFields, format_field_value, IsLink, HasType, ) from .storage import Storage from .transform import Transform, TransformTransform from .run import Run, TracksRun, TracksUpdates, current_run, User from .feature import Feature, JsonValue from .schema import Schema from .ulabel import ULabel # should come last as it needs everything else from .artifact import Artifact, LazyArtifact from ._feature_manager import FeatureManager from ._label_manager import LabelManager from .collection import Collection, CollectionArtifact from .project import Project, Reference from .query_manager import RelatedManager, QueryManager from .query_set import BasicQuerySet, QuerySet, DB, SQLRecordList from .artifact_set import ArtifactSet from .has_parents import HasParents from datetime import datetime as _datetime # link models from .artifact import ArtifactJsonValue, ArtifactArtifact, ArtifactUser, ArtifactRun from .project import ( ArtifactProject, ArtifactReference, BlockProject, CollectionProject, CollectionReference, FeatureProject, ProjectRecord, RecordProject, RecordReference, ReferenceRecord, RunProject, SchemaProject, TransformProject, TransformReference, ULabelProject, ) from .run import RunJsonValue from .schema import ( SchemaFeature, ArtifactSchema, SchemaComponent, SchemaOptionals, ) from .ulabel import ArtifactULabel, TransformULabel, RunULabel, CollectionULabel from .record import ( Record, ArtifactRecord, CollectionRecord, RecordArtifact, RecordCollection, RecordJson, RecordRecord, RecordRun, RecordTransform, RecordULabel, RecordUser, RunRecord, TransformRecord, ) from .block import ( BaseBlock, Block, ArtifactBlock, BranchBlock, CollectionBlock, FeatureBlock, ProjectBlock, RecordBlock, RunBlock, SchemaBlock, SpaceBlock, TransformBlock, ULabelBlock, ) FeatureValue = JsonValue # backward compatibility ================================================ FILE: lamindb/models/_describe.py ================================================ from __future__ import annotations import re from types import SimpleNamespace from typing import TYPE_CHECKING, Literal from django.db import connections from django.db.models import Q from lamin_utils import colors, logger from rich.table import Column, Table from rich.text import Text from rich.tree import Tree from lamindb.models import BaseSQLRecord, Branch, Run from ._is_versioned import IsVersioned from .sqlrecord import SQLRecord, format_field_value if TYPE_CHECKING: from lamindb.models import Artifact, Collection, Record, Schema, Transform from .run import TracksRun # Define consistent column widths for use in other modules NAME_WIDTH = 30 TYPE_WIDTH = 35 # types can get long, e.g. 
cat[Record[Treatment]] VALUES_WIDTH = 40 def strip_ansi_from_string(text: str) -> str: """Remove ANSI escape sequences from a string.""" ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") return ansi_escape.sub("", text) def format_rich_tree( tree: Tree, return_str: bool = False, strip_ansi: bool = True ) -> str | None: from rich.console import Console from ..core._context import is_run_from_ipython console = Console(force_terminal=True) printed = False if return_str: from io import StringIO string_io = StringIO() str_console = Console(file=string_io, force_terminal=True) str_console.print(tree) result = string_io.getvalue() if strip_ansi: result = strip_ansi_from_string(result) # rstrip trailing whitespace on every line result = "\n".join(line.rstrip() for line in result.splitlines()) return result try: if not is_run_from_ipython: from IPython import get_ipython from IPython.core.interactiveshell import InteractiveShell from IPython.display import display shell = get_ipython() if isinstance(shell, InteractiveShell): display(tree) printed = True return None except (NameError, ImportError): pass if not printed: # be careful to test this on a terminal console = Console(force_terminal=True) console.print(tree) return None def format_run_title( record: Run | SimpleNamespace | None, transform_key: str | None = None, dim: bool = False, ) -> Text: if record is None: return Text("") display_name = ( Text(record.name, style="cyan3") if record.name is not None else Text(record.uid[:7], style="cyan3") ) if transform_key is None: transform_key = record.transform.key title = Text.assemble( display_name, (" (", "dim"), (transform_key, "cyan3"), (")", "dim"), ) return title def format_title_with_version( record: IsVersioned | SimpleNamespace, ) -> Text: title_str = record.key if record.key is not None else "" title = Text.assemble( (title_str, "cyan3"), (f" ({record.version})", "dim"), Text.assemble(("\n| description: ", "dim"), record.description) if record.description else Text(""), ) return title def describe_header(record: BaseSQLRecord) -> Tree: if isinstance(record, IsVersioned) and not record.is_latest: logger.warning( f"This is not the latest version of the {record.__class__.__name__}." 
) if isinstance(record, SQLRecord): if record.branch_id == 0: logger.warning("This artifact is archived.") elif record.branch_id == -1: logger.warning("This artifact is in the trash.") if isinstance(record, Run): title = format_run_title(record, dim=True) # dim makes the uid grey elif isinstance(record, IsVersioned) or isinstance(record, SimpleNamespace): title = format_title_with_version(record) else: display_field = ( record._name_field if hasattr(record, "_name_field") else "name" if hasattr(record, "name") else "" ) display_value = getattr(record, display_field, None) if display_field else None if display_value in (None, ""): display_value = record.uid[:7] if hasattr(record, "uid") else "" title = Text.assemble( ( str(display_value), "cyan3", ) ) tree = Tree( Text.assemble( (f"{record.__class__.__name__}: ", "bold"), title, ), guide_style="dim", # dim the connecting lines ) return tree def format_bytes(bytes_value): """Convert bytes to human readable format.""" if bytes_value < 1024: return f"{bytes_value} B" elif bytes_value < 1024**2: return f"{bytes_value / 1024:.1f} KB" elif bytes_value < 1024**3: return f"{bytes_value / (1024**2):.1f} MB" elif bytes_value < 1024**4: return f"{bytes_value / (1024**3):.1f} GB" else: return f"{bytes_value / (1024**4):.1f} TB" def append_uid_run(record: TracksRun, two_column_items: list, fk_data=None) -> None: if fk_data and "run" in fk_data and fk_data["run"] and fk_data["run"]["id"]: run, transform_key = ( SimpleNamespace(**fk_data["run"]), fk_data["run"]["transform_key"], ) elif record.run is not None: run, transform_key = record.run, record.run.transform.key else: run, transform_key = None, None text_uid = Text.assemble(("uid: ", "dim"), f"{record.uid}") text_run = Text.assemble( ("run: ", "dim"), format_run_title(run, transform_key=transform_key) ) two_column_items.append(text_uid) two_column_items.append(text_run) def append_branch_space_created_at_created_by( record: SQLRecord, two_column_items, fk_data=None ): # branch branch_name = fk_data["branch"]["name"] if fk_data else record.branch.name two_column_items.append(Text.assemble(("branch: ", "dim"), branch_name)) # space space_name = fk_data["space"]["name"] if fk_data else record.space.name two_column_items.append(Text.assemble(("space: ", "dim"), space_name)) # created_at two_column_items.append( Text.assemble(("created_at: ", "dim"), format_field_value(record.created_at)) ) # created_by / "name" in fk_data holds handle, is display name created_by_handle = ( fk_data["created_by"]["name"] if fk_data else record.created_by.handle ) two_column_items.append(Text.assemble(("created_by: ", "dim"), created_by_handle)) def add_two_column_items_to_tree(tree: Tree, two_column_items: list) -> None: table = Table( Column("", no_wrap=True), Column("", no_wrap=True), show_header=False, box=None, pad_edge=False, ) for i in range(0, len(two_column_items), 2): if i + 1 < len(two_column_items): left_item = two_column_items[i] right_item = two_column_items[i + 1] table.add_row(left_item, right_item) else: table.add_row(two_column_items[i], "") tree.add(table) def describe_artifact( record: Artifact, related_data: dict | None = None, ) -> Tree: from ._feature_manager import describe_features from ._label_manager import describe_labels if related_data is not None: fk_data = related_data.get("fk", {}) else: fk_data = {} tree = describe_header(record) dataset_features_tree, external_features_tree = describe_features( record, related_data=related_data, ) labels_tree = describe_labels(record, 
related_data=related_data) two_column_items = [] # type: ignore append_uid_run(record, two_column_items, fk_data) if record.kind or record.otype: two_column_items.append(Text.assemble(("kind: ", "dim"), f"{record.kind}")) two_column_items.append(Text.assemble(("otype: ", "dim"), f"{record.otype}")) two_column_items.append(Text.assemble(("hash: ", "dim"), f"{record.hash}")) two_column_items.append( Text.assemble(("size: ", "dim"), f"{format_bytes(record.size)}") ) append_branch_space_created_at_created_by(record, two_column_items, fk_data) if record.n_observations: two_column_items.append( Text.assemble(("n_observations: ", "dim"), f"{record.n_observations}") ) if record.n_files: two_column_items.append( Text.assemble(("n_files: ", "dim"), f"{record.n_files}") ) schema_name = None if fk_data and "schema" in fk_data and fk_data["schema"]: schema_name = fk_data["schema"]["name"] elif record.schema_id is not None and record.schema is not None: schema_name = ( record.schema.name if record.schema.name is not None else record.schema.uid[:7] ) if schema_name is not None: two_column_items.append(Text.assemble(("schema: ", "dim"), schema_name)) add_two_column_items_to_tree(tree, two_column_items) storage_root = fk_data["storage"]["name"] if fk_data else record.storage.root storage_key = ( record.key if not record._key_is_virtual else record._real_key if record._real_key else f".lamindb/{record.uid}" ) if record.uid in storage_key: if record.overwrite_versions: storage_key = storage_key[:-4] storage_key = f"{storage_key}{record.suffix}" tree.add( Text.assemble( ("storage/path: ", "dim"), (storage_root, "cyan3"), ("/", "dim"), storage_key, ) ) if dataset_features_tree: tree.add(dataset_features_tree) if external_features_tree: tree.add(external_features_tree) if labels_tree: tree.add(labels_tree) return tree def describe_collection( record: Collection, related_data: dict | None = None, ) -> Tree: tree = describe_header(record) if related_data is not None: fk_data = related_data.get("fk", {}) else: fk_data = {} two_column_items = [] # type: ignore append_uid_run(record, two_column_items, fk_data) append_branch_space_created_at_created_by(record, two_column_items, fk_data) add_two_column_items_to_tree(tree, two_column_items) return tree def display_text( text: str, title: str, tree: Tree, max_lines: int = 30, uid: str = "" ) -> None: # Split the code into lines and add dim vertical bars lines = text.split("\n") end_parts = [("\n│ …", "grey30")] if len(lines) > max_lines else [] parts = [(title + ": ", "purple")] parts.append((uid, "")) max_length = 80 for line in lines[:max_lines]: parts.append(("\n│ ", "dim")) parts.append((line[:max_length], "grey30")) if len(line) > max_length: parts.append((" …", "grey30")) parts.extend(end_parts) tree.add(Text.assemble(*parts)) def describe_run( record: Run, related_data: dict | None = None, ) -> Tree: from ._feature_manager import describe_features tree = describe_header(record) if related_data is not None: fk_data = related_data.get("fk", {}) else: fk_data = {} _, features_tree = describe_features( record, related_data=related_data, ) two_column_items = [] # type: ignore two_column_items.append(Text.assemble(("uid: ", "dim"), f"{record.uid}")) if fk_data and "transform" in fk_data: transform = SimpleNamespace(**fk_data["transform"], description="") else: transform = record.transform transform_key = transform.key if transform and transform.key is not None else "" transform_version = ( f" ({transform.version})" if transform and transform.version is not None else "" 
) two_column_items.append( Text.assemble( ("transform: ", "dim"), (transform_key, "cyan3"), (transform_version, "dim"), ) ) two_column_items.append( Text.assemble( ("started_at: ", "dim"), format_field_value(record.started_at, none="") ) ) two_column_items.append( Text.assemble( ("finished_at: ", "dim"), format_field_value(record.finished_at, none="") ) ) two_column_items.append(Text.assemble(("status: ", "dim"), record.status)) two_column_items.append( Text.assemble(("reference: ", "dim"), record.reference) if record.reference else Text("") ) append_branch_space_created_at_created_by(record, two_column_items, fk_data) add_two_column_items_to_tree(tree, two_column_items) if record.cli_args: display_text( record.cli_args.strip(), "cli_args", tree, max_lines=4, ) if record.report_id: report = record.report.load(is_run_input=False) if report: report_str = report if isinstance(report, str) else str(report) display_text( strip_ansi_from_string(report_str.strip()), "report", tree, max_lines=4, uid=record.report.uid[:7], ) if record.environment_id: env_result = record.environment.load(is_run_input=False) env_str = env_result if isinstance(env_result, str) else str(env_result) display_text( env_str.strip(), "environment", tree, max_lines=4, uid=record.environment.uid[:7], ) if record.params: params = tree.add(Text("Params", style="bold dark_orange")) for key, value in record.params.items(): params.add(f"{key}: {value}") if features_tree: tree.add(features_tree) return tree def describe_record( record: Record, related_data: dict | None = None, ) -> Tree: from ._feature_manager import describe_features tree = describe_header(record) if related_data is not None: fk_data = related_data.get("fk", {}) else: fk_data = {} _, features_tree = describe_features( record, related_data=related_data, ) two_column_items = [] # type: ignore append_uid_run(record, two_column_items, fk_data) type_name = ( fk_data["type"]["name"] if fk_data and "type" in fk_data and fk_data["type"] else record.type.name if record.type_id is not None else "" ) if type_name is None: type_name = "" two_column_items.append(Text.assemble(("type: ", "dim"), type_name)) two_column_items.append(Text.assemble(("is_type: ", "dim"), f"{record.is_type}")) schema_name = ( fk_data["schema"]["name"] if fk_data and "schema" in fk_data and fk_data["schema"] else record.schema.name if record.schema_id is not None else "" ) if schema_name is None: schema_name = "" two_column_items.append(Text.assemble(("schema: ", "dim"), schema_name)) reference = record.reference if record.reference is not None else "" two_column_items.append(Text.assemble(("reference: ", "dim"), reference)) append_branch_space_created_at_created_by(record, two_column_items, fk_data) add_two_column_items_to_tree(tree, two_column_items) if features_tree: tree.add(features_tree) return tree def describe_transform( record: Transform, related_data: dict | None = None, ) -> Tree: tree = describe_header(record) if related_data is not None: fk_data = related_data.get("fk", {}) else: fk_data = {} two_column_items = [] # type: ignore two_column_items.append(Text.assemble(("uid: ", "dim"), f"{record.uid}")) two_column_items.append( Text.assemble(("reference: ", "dim"), record.reference) if record.reference else Text("") ) two_column_items.append(Text.assemble(("hash: ", "dim"), f"{record.hash}")) two_column_items.append(Text.assemble(("type: ", "dim"), f"{record.type}")) append_branch_space_created_at_created_by(record, two_column_items, fk_data) add_two_column_items_to_tree(tree, 
two_column_items) if record.source_code: display_text(record.source_code.strip(), "source_code", tree) return tree def describe_branch(record: Branch) -> Tree: tree = describe_header(record) two_column_items = [] # type: ignore two_column_items.append(Text.assemble(("status: ", "dim"), record.status)) two_column_items.append(Text.assemble(("space: ", "dim"), record.space.name)) two_column_items.append( Text.assemble(("created_at: ", "dim"), format_field_value(record.created_at)) ) two_column_items.append( Text.assemble(("created_by: ", "dim"), record.created_by.handle) ) add_two_column_items_to_tree(tree, two_column_items) return tree def describe_schema(record: Schema, slot: str | None = None) -> Tree: from ._feature_manager import format_dtype_for_display, strip_cat if record.type: prefix = f" {record.type.name} · " else: prefix = " " if record.name: name = record.name else: name = "unnamed" header = "Schema:" if slot is None else f"{slot}:" description = ( Text.assemble(("\n| description: ", "dim"), record.description) if record.description else Text("") ) tree = Tree( Text.assemble( (header, "bold"), (f"{prefix}", "dim"), (f"{name}", "cyan3"), description ), guide_style="dim", ) two_column_items = [] # type: ignore append_uid_run(record, two_column_items) two_column_items.append(Text.assemble(("itype: ", "dim"), f"{record.itype}")) two_column_items.append(Text.assemble(("otype: ", "dim"), f"{record.otype}")) two_column_items.append(Text.assemble(("hash: ", "dim"), f"{record.hash}")) two_column_items.append( Text.assemble(("ordered_set: ", "dim"), f"{record.ordered_set}") ) two_column_items.append( Text.assemble(("maximal_set: ", "dim"), f"{record.maximal_set}") ) two_column_items.append( Text.assemble(("minimal_set: ", "dim"), f"{record.minimal_set}") ) append_branch_space_created_at_created_by(record, two_column_items) add_two_column_items_to_tree(tree, two_column_items) # Add features section n_members = record.n_members members_count_display = f" ({n_members})" if n_members else "" if n_members or (record.dtype and record.itype is not None): features = tree.add( Text.assemble( ( "Features" if record.itype == "Feature" else record.itype, "bold bright_magenta", ), (members_count_display, "bold dim"), ) ) if n_members is not None: feature_table = Table( show_header=True, header_style="dim", box=None, pad_edge=False ) feature_table.add_column("name", style="", no_wrap=True) feature_table.add_column("dtype", style="", no_wrap=True) feature_table.add_column("optional", style="", no_wrap=True) feature_table.add_column("nullable", style="", no_wrap=True) feature_table.add_column("coerce", style="", no_wrap=True) feature_table.add_column("default_value", style="", no_wrap=True) optionals = record.optionals.get() for member in record.members: feature_table.add_row( Text(member.name), Text(strip_cat(format_dtype_for_display(member._dtype_str))), "✓" if optionals.filter(uid=member.uid).exists() else "✗", "✓" if member.nullable else "✗", "✓" if record.coerce or member.coerce else "✗", str(member.default_value) if member.default_value else "unset", ) features.add(feature_table) elif record.dtype: features.add(Text.assemble(("dtype: ", "dim"), f"{record.dtype}")) return tree def describe_postgres(record): from ._django import get_artifact_or_run_with_related, get_collection_with_related model_name = record.__class__.__name__ msg = f"{colors.green(model_name)}{record.__repr__(include_foreign_keys=False).lstrip(model_name)}\n" if record._state.db is not None and record._state.db != "default": msg += 
f" {colors.italic('Database instance')}\n" msg += f" slug: {record._state.db}\n" if model_name in {"Artifact", "Run"}: result = get_artifact_or_run_with_related( record, include_feature_link=True, include_fk=True, include_m2m=True, include_schema=True, ) related_data = result.get("related_data", {}) if model_name == "Artifact": tree = describe_artifact(record, related_data=related_data) else: tree = describe_run(record, related_data=related_data) elif model_name == "Record": result = get_artifact_or_run_with_related( record, include_feature_link=True, include_fk=True, ) related_data = result.get("related_data", {}) tree = describe_record(record, related_data=related_data) elif model_name == "Collection": result = get_collection_with_related(record, include_fk=True) related_data = result.get("related_data", {}) tree = describe_collection(record, related_data=related_data) elif model_name == "Transform": tree = describe_transform(record) elif model_name == "Branch": tree = describe_branch(record) else: tree = describe_header(record) return tree def describe_sqlite(record): model_name = record.__class__.__name__ msg = f"{colors.green(model_name)}{record.__repr__(include_foreign_keys=False).lstrip(model_name)}\n" if record._state.db is not None and record._state.db != "default": msg += f" {colors.italic('Database instance')}\n" msg += f" slug: {record._state.db}\n" fields = record._meta.fields direct_fields = [] foreign_key_fields = [] for f in fields: if f.is_relation: foreign_key_fields.append(f.name) else: direct_fields.append(f.name) if not record._state.adding: # prefetch foreign key relationships record = ( record.__class__.objects.using(record._state.db) .select_related(*foreign_key_fields) .get(id=record.id) ) # prefetch m-2-m relationships many_to_many_fields = [] if model_name in {"Artifact", "Collection"}: many_to_many_fields.append("input_of_runs") if model_name == "Artifact": many_to_many_fields.append("schemas") record = ( record.__class__.objects.using(record._state.db) .prefetch_related(*many_to_many_fields) .get(id=record.id) ) if model_name in {"Artifact", "Run", "Record"}: if model_name == "Artifact": tree = describe_artifact(record) elif model_name == "Run": tree = describe_run(record) else: tree = describe_record(record) elif model_name == "Collection": tree = describe_collection(record) elif model_name == "Transform": tree = describe_transform(record) elif model_name == "Branch": tree = describe_branch(record) else: tree = describe_header(record) return tree def append_readme_blocks_to_tree( record, tree: Tree, include: None | Literal["comments"] = None ) -> None: """Append readme (and optionally comment) block content to the describe tree.""" if record._state.adding: return if not hasattr(record, "ablocks"): return if include == "comments": blocks_qs = record.ablocks.filter( Q(kind="readme", is_latest=True) | Q(kind="comment") ).select_related("created_by") else: blocks_qs = record.ablocks.filter(kind="readme", is_latest=True) blocks = list(blocks_qs.order_by("created_at")) # README first, then comments; each group sorted chronologically readme_blocks = [b for b in blocks if b.kind == "readme"] comment_blocks = [b for b in blocks if b.kind == "comment"] for block in readme_blocks + comment_blocks: if block.kind == "readme": title = "README" else: handle = block.created_by.handle if block.created_by else "?" 
created_at_str = format_field_value(block.created_at) title = f"comment by {handle} at {created_at_str}" display_text( block.content, title, tree, max_lines=30, uid="", ) def describe_postgres_sqlite( record, return_str: bool = False, include: None | Literal["comments"] = None, ) -> str | None: from ._describe import format_rich_tree if ( not record._state.adding and connections[record._state.db].vendor == "postgresql" ): tree = describe_postgres(record) else: tree = describe_sqlite(record) append_readme_blocks_to_tree(record, tree, include=include) return format_rich_tree(tree, return_str=return_str) ================================================ FILE: lamindb/models/_django.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Any from django.contrib.postgres.aggregates import ArrayAgg from django.db import connection from django.db.models import CharField, F, OuterRef, Q, Subquery from django.db.models.fields.related import ForeignKey, ManyToManyField from django.db.models.fields.reverse_related import ManyToManyRel, ManyToOneRel from django.db.models.functions import JSONObject from ._relations import dict_related_model_to_related_name, get_schema_modules from .schema import Schema if TYPE_CHECKING: from .artifact import Artifact, Collection from .record import Record from .run import Run def patch_many_to_many_descriptor() -> None: """Patches Django's `ManyToManyDescriptor.__get__` method to suggest better errors when saving relationships of an unsaved model. Before this patch: Cryptic errors are raised when relationships of an unsaved record are attempted to be modified. After this patch: Attempts to access M2M relationships on unsaved objects will raise ValueError, suggesting explicit .save() of the record to be modified before relationship creation. """ from django.db.models.fields.related_descriptors import ManyToManyDescriptor original_get = ManyToManyDescriptor.__get__ def patched_get(self, instance, cls=None): if instance is not None and instance.pk is None: raise ValueError( f"You are trying to access the many-to-many relationships of an unsaved {instance.__class__.__name__} object. " f"Please save it first using '.save()'." ) manager = original_get(self, instance, cls) if manager is None or not hasattr(manager, "add"): return manager original_manager_add = manager.add def patched_manager_add(*objs, **kwargs): try: return original_manager_add(*objs, **kwargs) except ValueError as e: if "Cannot add" in str(e) and "database" in str(e): source_db = manager.instance._state.db raise ValueError( f"Cannot label a record from instance '{source_db}'. " f"Please save the record first to your instance using '.save()'." 
) from None raise manager.add = patched_manager_add return manager ManyToManyDescriptor.__get__ = patched_get def get_related_model(model, field_name): try: field = model._meta.get_field(field_name) if isinstance(field, (ForeignKey, ManyToManyField)): # Forward ForeignKey or ManyToManyField return field.remote_field.model elif isinstance(field, (ManyToOneRel, ManyToManyRel)): # Reverse ForeignKey or ManyToManyField return field.related_model else: return f"Unexpected field type: {type(field)}" except Exception as e: return f"Error: {str(e)}" def get_artifact_or_run_with_related( record: Artifact | Run | Record, include_fk: bool = False, include_m2m: bool = False, include_feature_link: bool = False, include_schema: bool = False, ) -> dict[str, Any]: """Fetch an artifact with its related data.""" from ._label_manager import EXCLUDE_LABELS from .can_curate import get_name_field from .query_set import get_default_branch_ids model = record.__class__ is_record = record.__class__.__name__ == "Record" is_artifact = record.__class__.__name__ == "Artifact" entity_field_name = record.__class__.__name__.lower() if entity_field_name in {"run", "record"} and include_schema: include_schema = False # runs do not have feature sets schema_modules = get_schema_modules(record._state.db) foreign_key_fields = [ f.name for f in model._meta.fields if f.is_relation and f.related_model.__get_module_name__() in schema_modules ] # Create the map that the conversion function will need. # It maps the target model class to the m2m field name, e.g., # {'Ulabel': 'ulabels', 'CellType': 'cell_types'} m2m_model_to_field_map = {} if include_m2m: full_map = dict_related_model_to_related_name(model, instance=record._state.db) m2m_model_to_field_map = { model_cls: field_name for model_cls, field_name in full_map.items() if not field_name.startswith("_") and field_name not in EXCLUDE_LABELS } if is_record: m2m_model_to_field_map["Run"] = "linked_runs" else: m2m_model_to_field_map["Run"] = "runs" link_tables = ( [] if not include_feature_link else list( dict_related_model_to_related_name( model, links=True, instance=record._state.db ).values() ) ) # Clear previous queries connection.queries_log.clear() annotations = {} if include_fk: for fk in foreign_key_fields: name_field = get_name_field(get_related_model(model, fk)) if fk == "run": annotations[f"fkfield_{fk}"] = JSONObject( id=F(f"{fk}__id"), name=F(f"{fk}__name"), uid=F(f"{fk}__uid"), transform_key=F(f"{fk}__transform__key"), ) elif fk == "transform": annotations[f"fkfield_{fk}"] = JSONObject( id=F(f"{fk}__id"), key=F(f"{fk}__key"), uid=F(f"{fk}__uid"), version=F(f"{fk}__version_tag"), ) elif fk == "created_by": annotations[f"fkfield_{fk}"] = JSONObject( id=F(f"{fk}__id"), name=F(f"{fk}__{name_field}") ) else: annotations[f"fkfield_{fk}"] = JSONObject( id=F(f"{fk}__id"), name=F(f"{fk}__{name_field}") ) for link in link_tables: link_model = getattr(model, link).rel.related_model if not hasattr(link_model, "feature"): continue if not is_record and link_model.__name__ in { "RecordArtifact", "RecordRun", }: continue if is_record and ( not link_model.__name__.startswith("Record") or link_model.__name__ in { "RecordJson", } ): continue if not is_record and not link_model.__name__ == "ArtifactArtifact": if link_model.__name__ == "RunArtifact": if is_artifact: continue else: label_field = "artifact" else: label_field = link.removeprefix("links_").replace("_", "") else: label_field = "value" related_model = link_model._meta.get_field(label_field).related_model # manually include 
"name" as pertdb.Compound.name is a TextField due to no length limitation char_field_names = [ field.name for field in related_model._meta.concrete_fields if isinstance(field, CharField) or field.name == "name" ] name_field = get_name_field(related_model) label_field_name = f"{label_field}__{name_field}" filter_kwargs = {entity_field_name: OuterRef("pk")} if link_model.__name__ not in { "RecordUser", "ArtifactUser", }: # user does not have branch filter_kwargs[f"{label_field}__branch_id__in"] = get_default_branch_ids() annotations[f"linkfield_{link}"] = Subquery( link_model.objects.filter(**filter_kwargs) .annotate( data=JSONObject( id=F("id"), feature=F("feature"), **{label_field: F(label_field)}, **{ label_field + "_display": F(label_field_name) }, # display field is the name field **{uf: F(f"{label_field}__{uf}") for uf in char_field_names}, ) ) .values(entity_field_name) .annotate(json_agg=ArrayAgg("data")) .values("json_agg") ) if include_schema: annotations["m2m_schemas"] = Subquery( model.schemas.through.objects.filter(artifact=OuterRef("pk")) .annotate( data=JSONObject( id=F("id"), slot=F("slot"), schema=F("schema"), ) ) .values(entity_field_name) .annotate(json_agg=ArrayAgg("data")) .values("json_agg") ) record_meta = ( model.objects.using(record._state.db) .filter(uid=record.uid) .annotate(**annotations) .values(*["id", "uid"], *annotations.keys()) .first() ) if not record_meta: return None related_data: dict = {"m2m": {}, "fk": {}, "link": {}, "m2m_schemas": {}} for k, v in record_meta.items(): if k.startswith("fkfield_") and v is not None: related_data["fk"][k[8:]] = v elif k.startswith("linkfield_") and v is not None: related_data["link"][k[10:]] = v elif k == "m2m_schemas": if v: related_data["m2m_schemas"] = get_schema_m2m_relations( record, {i["schema"]: i["slot"] for i in v} ) def convert_link_data_to_m2m( link_data: dict, model, # The main artifact model class is still needed for introspection m2m_model_map: dict, # The pre-computed map from Step 1 ) -> dict: """Converts link data to M2M-style data using a pre-computed model-to-field-name map.""" # link_data: {'links_tissue': [{'id': 1, 'uid': '1fIFAQJY', 'abbr': None, 'name': 'brain', 'tissue': 1, 'feature': 1, 'ontology_id': 'UBERON:0000955', 'tissue_display': 'brain'}, {'id': 2, 'uid': '7Tt4iEKc', 'abbr': None, 'name': 'lung', 'tissue': 10, 'feature': 1, 'ontology_id': 'UBERON:0002048', 'tissue_display': 'lung'}], 'links_cell_type': [{'id': 1, 'uid': '3QnZfoBk', 'abbr': None, 'name': 'neuron', 'feature': 2, 'celltype': 1, 'ontology_id': 'CL:0000540', 'celltype_display': 'neuron'}]} m2m_data = {} for link_name, records in link_data.items(): if not records: continue link_model = getattr(model, link_name).rel.related_model if not is_record: id_field_name = link_name.removeprefix("links_").replace("_", "") else: id_field_name = "value" final_target_model = link_model._meta.get_field(id_field_name).related_model m2m_field_name = m2m_model_map.get( final_target_model.__get_name_with_module__() ) m2m_data[m2m_field_name] = { record[id_field_name]: record for record in records } return m2m_data related_data["m2m"] = convert_link_data_to_m2m( related_data["link"], model=model, m2m_model_map=m2m_model_to_field_map ) return { **{name: record_meta[name] for name in ["id", "uid"]}, "related_data": related_data, } def get_collection_with_related( collection: Collection, include_fk: bool = False, ) -> dict[str, Any]: """Fetch a collection with its related data.""" from .can_curate import get_name_field model = collection.__class__ 
schema_modules = get_schema_modules(collection._state.db) foreign_key_fields = [ f.name for f in model._meta.fields if f.is_relation and f.related_model.__get_module_name__() in schema_modules ] # Clear previous queries connection.queries_log.clear() annotations = {} if include_fk: for fk in foreign_key_fields: name_field = get_name_field(get_related_model(model, fk)) if fk == "run": annotations[f"fkfield_{fk}"] = JSONObject( id=F(f"{fk}__id"), name=F(f"{fk}__{name_field}"), transform_key=F(f"{fk}__transform__key"), ) else: annotations[f"fkfield_{fk}"] = JSONObject( id=F(f"{fk}__id"), name=F(f"{fk}__{name_field}") ) collection_meta = ( model.objects.using(collection._state.db) .filter(uid=collection.uid) .annotate(**annotations) .values(*["id", "uid"], *annotations.keys()) .first() ) if not collection_meta: return None related_data: dict = {"fk": {}} for k, v in collection_meta.items(): if k.startswith("fkfield_") and v is not None: related_data["fk"][k[8:]] = v return { **{name: collection_meta[name] for name in ["id", "uid"]}, "related_data": related_data, } def get_schema_m2m_relations(artifact: Artifact, slot_schema: dict, limit: int = 20): """Fetch all many-to-many relationships for given feature sets.""" from .can_curate import get_name_field m2m_relations = [ v for v in dict_related_model_to_related_name(Schema).values() if v is not None and not v.startswith("_") and v != "artifacts" ] annotations = {} related_names = {} for name in m2m_relations: related_model = get_related_model(Schema, name) if related_model is Schema: # this is for the `type` field continue name_field = get_name_field(related_model) # Get the correct field names for the through table if not hasattr(getattr(Schema, name), "through"): continue through_model = getattr(Schema, name).through # Subquery to get limited related records limited_related = Subquery( through_model.objects.filter(schema=OuterRef("pk")).values( related_model.__name__.lower() )[:limit] ) annotations[f"m2mfield_{name}"] = ArrayAgg( JSONObject(id=F(f"{name}__id"), name=F(f"{name}__{name_field}")), filter=Q( **{ f"{name}__id__in": limited_related, } ), distinct=True, ) related_names[name] = related_model.__get_name_with_module__() schema_m2m = ( Schema.connect(artifact._state.db) .filter(id__in=slot_schema.keys()) .annotate(**annotations) .values("id", *annotations.keys()) ) result = {} for fs in schema_m2m: slot = slot_schema.get(fs["id"]) result[fs["id"]] = ( slot, { related_names.get(k[9:]): [item["name"] for item in v] for k, v in fs.items() if k.startswith("m2mfield_") and v }, ) return result patch_many_to_many_descriptor() ================================================ FILE: lamindb/models/_feature_manager.py ================================================ # ruff: noqa: TC004 from __future__ import annotations from collections import defaultdict from collections.abc import Iterable from datetime import date, datetime from itertools import compress from pathlib import Path from typing import TYPE_CHECKING, Any import numpy as np from django.contrib.postgres.aggregates import ArrayAgg from django.db import connections from django.db.models import Aggregate, Subquery from django.db.models.expressions import RawSQL from django.db.utils import IntegrityError from lamin_utils import logger from lamindb_setup.core.upath import UPath from lamindb_setup.errors import ModuleWasntConfigured from rich.table import Column, Table from rich.text import Text from rich.tree import Tree from lamindb.errors import DoesNotExist, InvalidArgument, 
ValidationError from lamindb.models._from_values import _format_values from lamindb.models.feature import ( serialize_pandas_dtype, suggest_categorical_for_str_iterable, ) from lamindb.models.has_parents import keep_topmost_matches from lamindb.models.save import save from lamindb.models.schema import DICT_KEYS_TYPE, Schema from lamindb.models.sqlrecord import ( REGISTRY_UNIQUE_FIELD, get_name_field, transfer_fk_to_default_db_bulk, transfer_to_default_db, ) from ._describe import ( NAME_WIDTH, TYPE_WIDTH, VALUES_WIDTH, describe_header, format_rich_tree, ) from ._django import get_artifact_or_run_with_related from ._label_manager import _get_labels from ._relations import ( dict_related_model_to_related_name, ) from .feature import Feature, FeaturePredicate, JsonValue, parse_dtype from .sqlrecord import SQLRecord from .ulabel import ULabel if TYPE_CHECKING: from rich.tree import Tree from lamindb.base.types import FieldAttr from lamindb.models import ( Artifact, Collection, IsLink, ) from lamindb.models.query_set import BasicQuerySet, SQLRecordList from ..base.types import DtypeObject from .record import Record from .run import Run def get_accessor_by_registry_(host: Artifact | Collection) -> dict: dictionary = { field.related_model.__get_name_with_module__(): field.name for field in host._meta.related_objects } dictionary["Feature"] = "features" dictionary["ULabel"] = "ulabels" dictionary["Record"] = "records" return dictionary def get_schema_by_slot_(host: Artifact) -> dict[str, Schema]: # if the host is not yet saved if host._state.adding: if hasattr(host, "_staged_schemas"): return host._staged_schemas else: return {} host_db = host._state.db kwargs = {"artifact_id": host.id} # otherwise, we need a query links_schema = ( host.schemas.through.objects.using(host_db) .filter(**kwargs) .select_related("schema") ) return {fsl.slot: fsl.schema for fsl in links_schema} def get_label_links( host: Artifact | Collection, registry: str, feature: Feature ) -> BasicQuerySet: kwargs = {"artifact_id": host.id, "feature_id": feature.id} link_records = ( getattr(host, host.features._accessor_by_registry[registry]) # type: ignore .through.objects.using(host._state.db) .filter(**kwargs) ) return link_records def get_schema_links(host: Artifact | Collection) -> BasicQuerySet: kwargs = {"artifact_id": host.id} links_schema = host.schemas.through.objects.filter(**kwargs) return links_schema def get_link_attr( link: IsLink | type[IsLink], data: Artifact | Collection | Run | type, ) -> str: link_model_name = link.__class__.__name__ if link_model_name in {"Registry", "ModelBase"}: # we passed the type of the link link_model_name = link.__name__ # type: ignore if link_model_name.startswith("Record") or link_model_name == "ArtifactArtifact": return "value" host_name = data.__name__ if isinstance(data, type) else data.__class__.__name__ return link_model_name.replace(host_name, "").lower() def get_categorical_link_info( host_class: type[SQLRecord], label_registry: type[SQLRecord], instance: str | None = None, ) -> tuple[type[SQLRecord], str, str]: """Resolve (link_model, value_field_name, filter_accessor_name) for (host_class, label_registry). Used by filter_base (categorical path) and _add_label_feature_links. 
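Illustrative example (hedged, assuming a bionty-enabled instance with the usual link
models): for (Artifact, bionty.Tissue) this resolves to roughly
(ArtifactTissue, "tissue", "links_tissue").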
""" host_name = host_class.__name__.lower() if host_name == "record": d = dict_related_model_to_related_name( host_class, links=True, instance=instance ) for rel in host_class._meta.related_objects: link_model = rel.related_model key = link_model.__get_name_with_module__() if key not in d: continue if not hasattr(link_model, "feature_id") or not hasattr( link_model, "value" ): continue value_fk = link_model._meta.get_field("value") if ( value_fk.remote_field is None or value_fk.remote_field.model != label_registry ): continue accessor = d[key] return (link_model, "value", accessor) raise ValueError( f"No categorical link model for Record + {label_registry.__name__}. " "Ensure the label registry has a Record* link model (e.g. RecordRecord, RecordULabel) " "or a bionty link model (e.g. RecordCellLine) in loaded schema modules." ) # Artifact, Run, or Collection attr_map = { "artifact": "artifacts", "run": "runs", "collection": "collections", } attr = attr_map.get(host_name) if not attr or not hasattr(label_registry, attr): raise ValueError( f"{label_registry.__name__} has no {attr or host_name!r} relation; " "cannot resolve categorical link for this host." ) through = getattr(label_registry, attr).through link_model = through host_fk = host_name # "artifact", "run", "collection" value_field = get_link_attr(link_model, host_class) filter_accessor = getattr(link_model, host_fk).field._related_name return (link_model, value_field, filter_accessor) def strip_cat(feature_dtype: str) -> str: if "cat[" in feature_dtype: parts = feature_dtype.split("cat[") dtype_stripped_cat = "".join( part[:-1] if i != 0 else part for i, part in enumerate(parts) ) else: dtype_stripped_cat = feature_dtype return dtype_stripped_cat def format_dtype_for_display(dtype_str: str) -> str: """Format dtype string for display, replacing Record[uid] or ULabel[uid] with Record[TypeName] or ULabel[TypeName].""" from .feature import parse_dtype from .record import Record from .ulabel import ULabel # Check if this is a Record[uid] or ULabel[uid] format if ("Record[" in dtype_str or "ULabel[" in dtype_str) and "]" in dtype_str: try: parsed = parse_dtype(dtype_str) if parsed and parsed[0].get("record_uid"): record_uid = parsed[0]["record_uid"] registry_str = parsed[0].get("registry_str", "") try: # Determine which registry to use if registry_str == "Record": record_type = Record.get(uid=record_uid) # Replace Record[uid] with Record[TypeName] dtype_str = dtype_str.replace( f"Record[{record_uid}]", f"Record[{record_type.name}]" ) elif registry_str == "ULabel": record_type = ULabel.get(uid=record_uid) # Replace ULabel[uid] with ULabel[TypeName] dtype_str = dtype_str.replace( f"ULabel[{record_uid}]", f"ULabel[{record_type.name}]" ) except Exception as e: # If we can't find the record, just return the original logger.debug( f"Could not find {registry_str} with uid '{record_uid}' for display formatting: {e}" ) except Exception as e: # If parsing fails, return the original logger.debug( f"Could not parse dtype string '{dtype_str}' for display formatting: {e}" ) return dtype_str # Custom aggregation for SQLite class GroupConcat(Aggregate): function = "GROUP_CONCAT" template = '%(function)s(%(expressions)s, ", ")' def custom_aggregate(field, using: str): if connections[using].vendor == "postgresql": return ArrayAgg(field) else: return GroupConcat(field) def get_categoricals_postgres( self: Artifact | Collection | Run, related_data: dict | None = None, ) -> dict[tuple[str, str], set[str]]: """Get categorical features and their values using 
PostgreSQL-specific optimizations.""" if related_data is None: if self.__class__.__name__ in {"Artifact", "Run", "Record"}: artifact_meta = get_artifact_or_run_with_related( self, include_feature_link=True, include_m2m=True ) related_data = artifact_meta.get("related_data", {}) else: related_data = {} # Process m2m data m2m_data = related_data.get("m2m", {}) if related_data else {} # e.g. m2m_data = {'tissues': {1: {'id': 1, 'uid': '1fIFAQJY', 'abbr': None, 'name': 'brain', 'tissue': 1, 'feature': 1, 'ontology_id': 'UBERON:0000955', 'tissue_display': 'brain'}, 10: {'id': 2, 'uid': '7Tt4iEKc', 'abbr': None, 'name': 'lung', 'tissue': 10, 'feature': 1, 'ontology_id': 'UBERON:0002048', 'tissue_display': 'lung'}}, 'cell_types': {1: {'id': 1, 'uid': '3QnZfoBk', 'abbr': None, 'name': 'neuron', 'feature': 2, 'celltype': 1, 'ontology_id': 'CL:0000540', 'celltype_display': 'neuron'}}} # e.g. {'tissue': {1: {'id': 1, 'uid': '1fIFAQJY', 'abbr': None, 'name': 'brain', 'tissue': 1, 'feature': 1, 'ontology_id': 'UBERON:0000955', 'tissue_display': 'brain'}, 10: {'id': 2, 'uid': '7Tt4iEKc', 'abbr': None, 'name': 'lung', 'tissue': 10, 'feature': 1, 'ontology_id': 'UBERON:0002048', 'tissue_display': 'lung'}}, 'celltype': {1: {'id': 1, 'uid': '3QnZfoBk', 'abbr': None, 'name': 'neuron', 'feature': 2, 'celltype': 1, 'ontology_id': 'CL:0000540', 'celltype_display': 'neuron'}}} # integers are the ids of the related labels m2m_name = {} if not self.__class__.__name__ == "Record": for related_name, values in m2m_data.items(): link_model = getattr(self.__class__, related_name).through related_model_name = link_model.__name__.replace( self.__class__.__name__, "", 1 ).lower() if related_model_name == "artifact": related_model_name = "value" m2m_name[related_model_name] = values else: m2m_name = related_data.get("m2m", {}) # Get feature information links_data = related_data.get("link", {}) if related_data else {} # e.g. 
feature_dict = {1: ('tissue', 'cat[bionty.Tissue.ontology_id]'), 2: ('cell_type', 'cat[bionty.CellType]')} feature_dict = { id: (name, dtype) for id, name, dtype in Feature.connect(self._state.db).values_list( "id", "name", "_dtype_str" ) } # Build result dictionary result = {} # type: ignore for link_name, link_values in links_data.items(): related_name = link_name.removeprefix("links_").replace("_", "") if not link_values: continue # sort by the order on the link table, important for list dtypes for link_value in sorted(link_values, key=lambda x: x.get("id")): feature_id = link_value.get("feature") if feature_id is None: continue feature_name, feature_dtype = feature_dict.get(feature_id) feature_field = parse_dtype(feature_dtype)[0]["field_str"] if not self.__class__.__name__ == "Record": label_id = link_value.get(related_name) label_name = ( m2m_name.get(related_name, {}).get(label_id, {}).get(feature_field) ) else: label_name = link_value.get(feature_field) if label_name: dict_key = (feature_name, feature_dtype) if dict_key not in result: result[dict_key] = ( set() if not feature_dtype.startswith("list[cat") else [] ) if feature_dtype.startswith("list[cat"): result[dict_key].append(label_name) else: result[dict_key].add(label_name) return dict(result) def get_categoricals_sqlite( self: Artifact | Collection, ) -> dict[tuple[str, str], set[str]]: """Get categorical features and their values using the default approach.""" from .query_set import get_default_branch_ids result = {} # type: ignore for _, links in _get_labels(self, links=True, instance=self._state.db).items(): for link in links: if link.__class__.__name__ == "RecordJson": continue if hasattr(link, "feature_id") and link.feature_id is not None: feature = Feature.objects.using(self._state.db).get(id=link.feature_id) dtype_str = feature._dtype_str feature_field = parse_dtype(dtype_str)[0]["field_str"] link_attr = get_link_attr(link, self) label = getattr(link, link_attr) if hasattr(label, "branch_id"): if label.branch_id not in get_default_branch_ids(): continue label_name = getattr(label, feature_field) dict_key = (feature.name, dtype_str) if dict_key not in result: result[dict_key] = ( set() if not dtype_str.startswith("list[cat") else [] ) if dtype_str.startswith("list[cat"): result[dict_key].append(label_name) else: result[dict_key].add(label_name) return dict(result) def get_non_categoricals( self, ) -> dict[tuple[str, str], set[Any]]: """Get non-categorical features and their values.""" import pandas as pd from .artifact import Artifact from .record import Record from .run import Run non_categoricals = {} if self.id is not None and isinstance(self, (Artifact, Run, Record)): if isinstance(self, Record): json_values = self.values_json.values( "feature__name", "feature___dtype_str", "value" ).order_by("feature__name") else: json_values = ( self.json_values.values("feature__name", "feature___dtype_str") .annotate(values=custom_aggregate("value", self._state.db)) .order_by("feature__name") ) for fv in json_values: feature_name = fv["feature__name"] feature_dtype = fv["feature___dtype_str"] if isinstance(self, Record): values = fv["value"] else: values = fv["values"] if connections[self._state.db].vendor == "sqlite": # undo GROUP_CONCAT if isinstance(values, str): values = {value.strip('"') for value in values.split(", ")} # Convert single values to sets if not isinstance(values, (list, dict, set)): values = {values} elif ( isinstance(values, list) and feature_dtype != "dict" and not feature_dtype.startswith("list") ): try: 
values = set(values) except TypeError: # TypeError: unhashable type: 'list' if values is list[list] pass # Handle special datetime types if feature_dtype == "datetime": values = {datetime.fromisoformat(value) for value in values} if feature_dtype == "date": # date.fromisoformat() cannot handle cases like 2025-01-17T00:00:00.000Z values = { pd.to_datetime(value, format="ISO8601").date() for value in values } if connections[self._state.db].vendor == "sqlite": # undo GROUP_CONCAT if feature_dtype == "int": values = {int(value) for value in values} if feature_dtype == "float": values = {float(value) for value in values} if feature_dtype == "num": values = {float(value) for value in values} non_categoricals[(feature_name, feature_dtype)] = values return non_categoricals def create_feature_table( name: str, registry_str: str, data: list, show_header: bool = False ) -> Table: """Create a Rich table for a feature group.""" table = Table( Column(name, style="", no_wrap=True, width=NAME_WIDTH), Column(registry_str, style="dim", no_wrap=True, width=TYPE_WIDTH), Column("", width=VALUES_WIDTH, no_wrap=True), show_header=show_header, box=None, pad_edge=False, ) for row in data: table.add_row(*row) return table def get_features_data( self: Artifact | Run | Record, related_data: dict | None = None, to_dict: bool = False, external_only: bool = False, ): from .artifact import Artifact dictionary: dict[str, Any] = {} if self._state.adding: if to_dict: return dictionary else: raise NotImplementedError # feature sets schema_data: dict[str, tuple[str, list[str]]] = {} feature_data: dict[str, tuple[str, list[str]]] = {} if not to_dict and isinstance(self, Artifact): if self.id is not None and connections[self._state.db].vendor == "postgresql": if not related_data: artifact_meta = get_artifact_or_run_with_related( self, include_schema=True, include_m2m=True, include_feature_link=True, ) related_data = artifact_meta.get("related_data", {}) fs_data = related_data.get("m2m_schemas", {}) if related_data else {} for fs_id, (slot, data) in fs_data.items(): for registry_str, feature_names in data.items(): # prevent projects show up as features if registry_str == "Project": continue schema = Schema.objects.using(self._state.db).get(id=fs_id) schema_data[slot] = (schema, feature_names) for feature_name in feature_names: feature_data[feature_name] = (slot, registry_str) schema_data.update( { slot: (schema, schema.n_members) # type: ignore for slot, schema in get_schema_by_slot_(self).items() if slot not in schema_data } ) else: for slot, schema in get_schema_by_slot_(self).items(): features = schema.members if features.exists(): # features.first() is a lot slower than features[0] here name_field = get_name_field(features[0]) feature_names = list( features.values_list(name_field, flat=True)[:20] ) schema_data[slot] = (schema, feature_names) for feature_name in feature_names: feature_data[feature_name] = (slot, schema.itype) else: schema_data[slot] = (schema, schema.n_members) internal_feature_names = {} if isinstance(self, Artifact): inferred_schemas = self.schemas.filter(itype="Feature") if len(inferred_schemas) > 0: for schema in inferred_schemas: # Use _dtype_str instead of dtype, and format for display feature_dtypes = dict(schema.members.values_list("name", "_dtype_str")) # Format Record[uid] to Record[TypeName] for display formatted_dtypes = { name: format_dtype_for_display(dtype_str) if dtype_str else "" for name, dtype_str in feature_dtypes.items() } internal_feature_names.update(formatted_dtypes) # categorical 
feature values # Get the categorical data using the appropriate method # e.g. categoricals = {('tissue', 'cat[bionty.Tissue.ontology_id]'): {'brain'}, ('cell_type', 'cat[bionty.CellType]'): {'neuron'}} if not self._state.adding and connections[self._state.db].vendor == "postgresql": categoricals = get_categoricals_postgres( self, related_data=related_data, ) else: categoricals = get_categoricals_sqlite( self, ) # Get non-categorical features non_categoricals = get_non_categoricals( self, ) internal_feature_labels = {} external_data = [] for features, is_categoricals in [(categoricals, True), (non_categoricals, False)]: for (feature_name, feature_dtype), values in sorted(features.items()): # Handle dictionary conversion if feature_dtype.startswith("list[cat"): converted_values = values # is already a list else: converted_values = values if len(values) > 1 else next(iter(values)) if to_dict: dictionary[feature_name] = converted_values continue # Format message if is_categoricals and isinstance(converted_values, set): printed_values = _format_values( sorted(converted_values), n=10, quotes=False ) elif ( not is_categoricals and not feature_dtype.startswith(("list", "dict")) and isinstance(converted_values, set) ): printed_values = _format_values( sorted(converted_values), n=10, quotes=False ) else: printed_values = str(converted_values) # Format dtype for display (replace Record[uid] with Record[TypeName]) display_dtype = format_dtype_for_display(feature_dtype) # Sort into internal/external feature_info = ( feature_name, Text(strip_cat(display_dtype), style="dim"), printed_values, ) if feature_name in internal_feature_names: internal_feature_labels[feature_name] = feature_info else: external_data.append(feature_info) if to_dict: if external_only: return { k: v for k, v in dictionary.items() if k not in internal_feature_names } else: return dictionary else: return ( internal_feature_labels, feature_data, schema_data, internal_feature_names, external_data, ) def describe_features( self: Artifact | Run | Record, related_data: dict | None = None, ) -> tuple[Tree | None, Tree | None]: """Describe features of an artifact or collection.""" if self._state.adding: return None, None ( internal_feature_labels, feature_data, schema_data, internal_feature_names, external_data, ) = get_features_data( self, related_data=related_data, ) # Dataset features section # internal features that contain labels (only `Feature` features contain labels) internal_feature_labels_slot: dict[str, list] = {} for feature_name, feature_row in internal_feature_labels.items(): slot, _ = feature_data.get(feature_name) internal_feature_labels_slot.setdefault(slot, []).append(feature_row) dataset_features_tree_children = [] for slot, (schema, feature_names_or_n) in schema_data.items(): if feature_names_or_n is None or isinstance(feature_names_or_n, int): feature_rows = [] else: feature_names = feature_names_or_n if slot in internal_feature_labels_slot: # add internal Feature features with labels feature_rows = internal_feature_labels_slot[slot] # add internal Feature features without labels feature_rows += [ ( feature_name, Text( strip_cat(internal_feature_names.get(feature_name)), style="dim", ), "", ) for feature_name in feature_names if feature_name and feature_name not in internal_feature_labels ] else: # add internal non-Feature features without labels feature_rows = [ ( feature_name, Text( strip_cat( internal_feature_names.get(feature_name) if feature_name in internal_feature_names else schema.dtype ), style="dim", ), "", ) 
for feature_name in feature_names if feature_name ] feature_rows.sort(key=lambda x: x[0]) schema_itype = f" {schema.itype}" if schema.itype != "Feature" else "" dataset_features_tree_children.append( create_feature_table( Text.assemble( (slot, "violet"), (f" ({schema.n_members}{schema_itype})", "dim"), ), "", feature_rows, show_header=True, ) ) # external features external_features_tree_children = [] if external_data: external_features_tree_children.append( create_feature_table( "", "", external_data, ) ) # trees dataset_features_tree = None if dataset_features_tree_children: dataset_features_tree = Tree( Text("Dataset features", style="bold bright_magenta") ) for child in dataset_features_tree_children: dataset_features_tree.add(child) external_features_tree = None if external_features_tree_children: external_features_text = ( "External features" if ( self.__class__.__name__ == "Artifact" and dataset_features_tree_children ) else "Features" ) external_features_tree = Tree( Text(external_features_text, style="bold dark_orange") ) for child in external_features_tree_children: external_features_tree.add(child) return dataset_features_tree, external_features_tree def infer_convert_dtype_key_value( key: str, value: Any, mute: bool = False, dtype_str: str | None = None ) -> tuple[str, Any, str]: import pandas as pd from lamindb.base.dtypes import is_valid_datetime_str message = "" if isinstance(value, bool): return "bool", value, message elif isinstance(value, int): return "int", value, message elif isinstance(value, float): return "float", value, message elif isinstance(value, datetime): return "datetime", value.isoformat(), message elif isinstance(value, date): return "date", value.isoformat(), message elif isinstance(value, str): if dtype_str in {None, "datetime", "date"} and ( datetime_str := is_valid_datetime_str(value) ): dt_type = ( "date" if len(value) == 10 else "datetime" ) # YYYY-MM-DD is exactly 10 characters sanitized_value = datetime_str[:10] if dt_type == "date" else datetime_str # type: ignore return dt_type, sanitized_value, message # type: ignore else: return "cat ? str", value, message elif isinstance(value, SQLRecord): # SQLRecord is not converted to JSON return (f"cat[{value.__class__.__get_name_with_module__()}]", value, message) elif isinstance(value, (Path, UPath)): return "path", value.as_posix().rstrip("/"), message elif isinstance(value, Iterable) and not isinstance(value, (str, bytes)): if isinstance(value, (pd.Series, np.ndarray, pd.Categorical)): dtype = serialize_pandas_dtype(value.dtype) if dtype == "str": # ndarray doesn't know categorical, so there was no conscious choice # offer both options if isinstance(value, np.ndarray): dtype = "cat ? str" else: # suggest to create a categorical if there are few unique values message = suggest_categorical_for_str_iterable(value, key) if message: message = f" # {message}" return dtype, list(value), message if isinstance(value, dict): return "dict", value, message if len(value) > 0: # type: ignore first_element = next(iter(value)) first_element_type = type(first_element) # check that all elements are of the same type if all(isinstance(elem, first_element_type) for elem in value): if first_element_type is bool: return "list[bool]", value, message elif first_element_type is int: return "list[int]", value, message elif first_element_type is float: return "list[float]", value, message elif first_element_type is str: return ("list[cat ? 
str]", value, message) elif isinstance(first_element, SQLRecord): return ( f"list[cat[{first_element_type.__get_name_with_module__()}]]", value, message, ) if not mute: logger.warning(f"cannot infer feature type of: {value}, returning '?'") return "?", value, message def _filter_one_feature_clause( queryset: BasicQuerySet, feature: Feature, comparator: str, value: Any, ) -> BasicQuerySet: from lamindb.models import Artifact from lamindb.models.record import Record, RecordJson from lamindb.models.run import Run dtype_str = feature._dtype_str # non-categorical features if not dtype_str.startswith("cat") and not dtype_str.startswith("list[cat"): if comparator == "__isnull": if queryset.model is Artifact: from .artifact import ArtifactJsonValue value_subquery = ArtifactJsonValue.objects.filter( jsonvalue__feature=feature ).values("artifact_id") return queryset.exclude(id__in=Subquery(value_subquery)) if comparator in {"__startswith", "__contains"}: logger.important( f"currently not supporting `{comparator}`, using `__icontains` instead" ) comparator = "__icontains" use_numeric_sqlite = ( connections[feature._state.db].vendor == "sqlite" and comparator in {"__gt", "__lt", "__gte", "__lte"} and dtype_str in ("int", "float", "num") ) if use_numeric_sqlite: # Numeric comparison via json_extract + CAST (avoids lexicographic comparison) num_val_raw = RawSQL("CAST(json_extract(value, '$') AS REAL)", ()) if queryset.model is Record: value_qs = ( RecordJson.objects.using(queryset.db) .filter(feature=feature) .annotate(num_val=num_val_raw) .filter(**{f"num_val{comparator}": value}) ) return queryset.filter(values_json__id__in=value_qs) else: json_values = ( JsonValue.objects.using(queryset.db) .filter(feature=feature) .annotate(num_val=num_val_raw) .filter(**{f"num_val{comparator}": value}) ) accessor = ( "json_values" if queryset.model in {Artifact, Run} else "values_json" ) return queryset.filter(**{f"{accessor}__id__in": json_values}) else: if connections[feature._state.db].vendor == "sqlite" and comparator in { "__gt", "__lt", "__gte", "__lte", }: # SQLite: lexicographic comparison for non-numeric dtypes (date, datetime, str) value = str(value) filter_expr = {"feature": feature, f"value{comparator}": value} if queryset.model is Record: value_qs = RecordJson.objects.using(queryset.db).filter(**filter_expr) return queryset.filter(values_json__id__in=value_qs) else: json_values = JsonValue.objects.using(queryset.db).filter(**filter_expr) accessor = ( "json_values" if queryset.model in {Artifact, Run} else "values_json" ) return queryset.filter(**{f"{accessor}__id__in": json_values}) # categorical features elif isinstance(value, (str, SQLRecord, bool)): result = parse_dtype(dtype_str)[0] label_registry = result["registry"] _, value_field_name, filter_accessor_name = get_categorical_link_info( queryset.model, label_registry, instance=queryset.db ) if comparator == "__isnull": kwargs = {f"{filter_accessor_name}__feature": feature} if value: # True return queryset.exclude(**kwargs) else: return queryset.filter(**kwargs) # because SQL is sensitive to whether querying with __in or not # and might return multiple equivalent records for the latter # we distinguish cases in which we have multiple label matches vs. 
one label = None labels = None if isinstance(value, str): field_name = result["field"].field.name # users might query like so: # ln.Artifact.filter(experiment__contains="Experi") expression = {f"{field_name}{comparator}": value} labels = result["registry"].connect(queryset.db).filter(**expression) if len(labels) == 0: raise DoesNotExist( f"Did not find a {label_registry.__name__} matching `{field_name}{comparator}={value}`" ) elif len(labels) == 1: label = labels[0] elif isinstance(value, SQLRecord): label = value new_expression = {f"{filter_accessor_name}__feature": feature} if label is not None: new_expression[f"{filter_accessor_name}__{value_field_name}"] = label else: new_expression[f"{filter_accessor_name}__{value_field_name}__in"] = labels return queryset.filter(**new_expression) raise NotImplementedError def filter_with_feature_predicates( queryset: BasicQuerySet, predicates: list[FeaturePredicate], ) -> BasicQuerySet: qs = queryset pk_name = qs.model._meta.pk.name for predicate in predicates: feature = predicate.feature if qs.db is not None and feature._state.db != qs.db: feature = Feature.connect(qs.db).get(uid=feature.uid) if predicate.comparator == "__ne": subset = _filter_one_feature_clause( qs, feature=feature, comparator="", value=predicate.value ) qs = qs.exclude(**{f"{pk_name}__in": Subquery(subset.values(pk_name))}) else: qs = _filter_one_feature_clause( qs, feature=feature, comparator=predicate.comparator, value=predicate.value, ) return qs def filter_base( queryset: BasicQuerySet, _skip_validation: bool = True, **expression, ) -> BasicQuerySet: from lamindb.models import BasicQuerySet, QuerySet assert isinstance(queryset, BasicQuerySet) and not isinstance(queryset, QuerySet) # noqa: S101 keys_normalized = [key.split("__")[0] for key in expression] if not _skip_validation: validated = Feature.connect(queryset.db).validate( keys_normalized, field="name", mute=True ) if sum(validated) != len(keys_normalized): raise ValidationError( f"Some keys in the filter expression are not registered as features: {np.array(keys_normalized)[~validated]}" ) features = Feature.connect(queryset.db).filter(name__in=keys_normalized).distinct() qs = queryset for key, value in expression.items(): split_key = key.split("__") normalized_key = split_key[0] comparator = "" if len(split_key) == 2: comparator = f"__{split_key[1]}" feature = features.get(name=normalized_key) qs = _filter_one_feature_clause( qs, feature=feature, comparator=comparator, value=value ) if qs is queryset: raise NotImplementedError return qs def filter_with_features( queryset: BasicQuerySet, *queries, **expressions ) -> BasicQuerySet: from lamindb.models import BasicQuerySet, QuerySet feature_predicates = [q for q in queries if isinstance(q, FeaturePredicate)] non_feature_queries = [q for q in queries if not isinstance(q, FeaturePredicate)] if isinstance(queryset, QuerySet): # need to avoid infinite recursion because # filter_with_features is called in queryset.filter otherwise filter_kwargs = {"_skip_filter_with_features": True} else: filter_kwargs = {} registry = queryset.model qs = queryset if expressions: keys_normalized = [key.split("__")[0] for key in expressions] field_or_feature = keys_normalized[0] if field_or_feature in registry.__get_available_fields__(): qs = queryset.filter(*non_feature_queries, **expressions, **filter_kwargs) elif all( features_validated := Feature.objects.using(queryset.db).validate( keys_normalized, field="name", mute=True ) ): # filter_base requires qs to be BasicQuerySet qs = filter_base( 
queryset._to_class(BasicQuerySet, copy=True), _skip_validation=True, **expressions, )._to_class(type(queryset), copy=False) qs = qs.filter(*non_feature_queries, **filter_kwargs) else: features = ", ".join(sorted(np.array(keys_normalized)[~features_validated])) message = f"feature names: {features}" avail_fields = registry.__get_available_fields__() fields = ", ".join(sorted(avail_fields)) raise InvalidArgument( f"You can query either by available fields: {fields}\n" f"Or fix invalid {message}" ) else: # Always route through `.filter()` here (even when empty) so the # standard QuerySet path can inject default branch constraints. qs = queryset.filter(*non_feature_queries, **filter_kwargs) if feature_predicates: qs = filter_with_feature_predicates( qs._to_class(BasicQuerySet, copy=True), feature_predicates, )._to_class(type(qs), copy=False) return qs class FeatureManager: """Feature manager.""" def __init__(self, sqlrecord: Artifact | Run | Record): # host is the sqlrecord that the label manager is attached to # we might rename _host to _sqlrecord in the future self._host = sqlrecord self._slots: dict[str, Schema] | None = None self._accessor_by_registry_ = None def __repr__(self) -> str: return self.describe(return_str=True) # type: ignore def describe(self, return_str: bool = False) -> str | None: """Pretty print features. This is what `artifact.describe()` calls under the hood. """ dataset_features_tree, external_features_tree = describe_features(self._host) # type: ignore tree = describe_header(self._host) if dataset_features_tree: tree.add(dataset_features_tree) if external_features_tree: tree.add(external_features_tree) return format_rich_tree(tree, return_str=return_str) def get_values(self, external_only: bool = False) -> dict[str, Any]: """Get features as a dictionary. Includes annotation with internal and external feature values. Args: external_only: If `True`, only return external feature annotations. """ return get_features_data(self._host, to_dict=True, external_only=external_only) # type: ignore def __getitem__( self, feature: str ) -> ( DtypeObject | BasicQuerySet | SQLRecord | SQLRecordList | dict[str, DtypeObject | BasicQuerySet | SQLRecord | SQLRecordList] ): """Get values by feature name. Args: feature: Feature name. Returns: - For categorical features, return value records. - For non-categorical features, return values. Example:: artifact.features['tissue'] #> Tissue(id=1, name='brain', ...) 
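# for a non-categorical feature, the value itself is returned
# (illustrative, assuming a hypothetical 'temperature' feature)
artifact.features['temperature']
#> 27.6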
""" from collections import defaultdict import pandas as pd from .query_set import SQLRecordList host_name = self._host.__class__.__name__ host_id = self._host.id host_db = self._host._state.db feature_records = list(Feature.objects.using(host_db).filter(name=feature)) if not feature_records: raise ValidationError(f"Feature with name {feature} not found") # group cat feature_records by their registry registry_to_features = defaultdict(list) for feature_record in feature_records: parsed_dtype = parse_dtype(feature_record._dtype_str) if len(parsed_dtype) > 0: # categorical features registry = parsed_dtype[0]["registry"] registry_name = registry.__get_name_with_module__() registry_to_features[(registry, registry_name)].append( feature_record.id ) else: # non-categorical features registry_to_features[(JsonValue, "JsonValue")].append(feature_record.id) value_records = {} # query once per registry with all feature_ids for (registry, registry_name), feature_ids in registry_to_features.items(): if registry_name == "JsonValue": # for non-categorical features filters = { "feature_id__in": feature_ids, f"links_{host_name.lower()}__{host_name.lower()}_id": host_id, } dtype_values = ( registry.objects.using(host_db) .filter(**filters) .distinct() .values_list("feature___dtype_str", "value") ) feature_values_qs = [] for dtype, value in dtype_values: if dtype == "date": value = pd.to_datetime(value, format="ISO8601").date() elif dtype == "datetime": value = datetime.fromisoformat(value) feature_values_qs.append(value) else: # determine links name once per registry links_value_name = ( "links_value" if registry_name == host_name else f"links_{host_name.lower()}" ) filters = { f"{links_value_name}__feature_id__in": feature_ids, f"{links_value_name}__{host_name.lower()}_id": host_id, } feature_values_qs = ( registry.objects.using(host_db).filter(**filters).distinct() ) if len(feature_values_qs) == 1: value_records[registry_name] = feature_values_qs[0] elif len(feature_values_qs) > 1: if feature_record.dtype_as_str.startswith("list["): value_records[registry_name] = SQLRecordList(feature_values_qs) else: value_records[registry_name] = feature_values_qs return ( next(iter(value_records.values())) if len(value_records) == 1 else value_records ) @property def slots(self) -> dict[str, Schema]: """Features by schema slot. 
Example:: artifact.features.slots #> {'var': , 'obs': } """ if self._slots is None: self._slots = get_schema_by_slot_(self._host) return self._slots @property def _accessor_by_registry(self): """Accessor by registry.""" if self._accessor_by_registry_ is None: self._accessor_by_registry_ = get_accessor_by_registry_(self._host) return self._accessor_by_registry_ def _add_label_feature_links( self, features_labels, ): host_name = self._host.__class__.__name__.lower() host_is_record = host_name == "record" instance = getattr(self._host._state, "db", None) for class_name, registry_features_labels in features_labels.items(): if not host_is_record and class_name == "Collection": continue registry_features_labels[0][0] label_registry = registry_features_labels[0][1].__class__ link_model, value_field_name, _ = get_categorical_link_info( self._host.__class__, label_registry, instance=instance ) field_name = f"{value_field_name}_id" host_fk = f"{host_name}_id" links = [ link_model( **{ host_fk: self._host.id, "feature_id": ftr.id, field_name: label.id, } ) for (ftr, label) in registry_features_labels ] try: save(links, ignore_conflicts=False) except Exception: save(links, ignore_conflicts=True) def _get_feature_objects(self, dictionary, feature_field): from ..core._functions import get_current_tracked_run registry = feature_field.field.model keys = list(dictionary.keys()) feature_objects = registry.from_values(keys, field=feature_field, mute=True) feature_objects = keep_topmost_matches(feature_objects) if len(feature_objects) != len(keys): not_validated_keys = [ key for key in keys if key not in feature_objects.to_list("name") ] not_validated_keys_dtype_message = [ (key, infer_convert_dtype_key_value(key, dictionary[key])) for key in not_validated_keys ] run = get_current_tracked_run() if run is not None: name = f"{run.transform.kind}[{run.transform.key}]" type_hint = f""" feature_type = ln.Feature(name='{name}', is_type=True).save()""" elements = [type_hint] type_kwarg = ", type=feature_type" else: elements = [] type_kwarg = "" elements += [ f" ln.Feature(name='{key}', dtype='{dtype}'{type_kwarg}).save(){message}" for key, (dtype, _, message) in not_validated_keys_dtype_message ] hint = "\n".join(elements) msg = ( f"These keys could not be validated: {not_validated_keys}\n" f"Here is how to create a feature:\n\n{hint}" ) raise ValidationError(msg) return feature_objects def _resolve_feature_value_dictionary( self, values: dict[str | Feature, Any], ) -> tuple[dict[str, Any], dict[str, Any], list[Feature], dict[str, Any]]: """Normalize a feature-value dictionary to support `str` and `Feature` keys. Returns: normalized_values: Values keyed by feature name (used by schema validators). string_key_values: Subset of values that came from string keys only. explicit_features: Resolved Feature objects passed explicitly as keys. values_by_feature_uid: Values keyed by feature uid (used for exact lookup). """ host_db = self._host._state.db normalized_values: dict[str, Any] = {} string_key_values: dict[str, Any] = {} explicit_features: list[Feature] = [] values_by_feature_uid: dict[str, Any] = {} seen_explicit_uids: set[str] = set() for key, value in values.items(): if isinstance(key, Feature): if key._state.adding: raise ValidationError( f"Please save feature '{key.name}' before annotation." ) feature = key # Mirror feature predicate resolution: resolve Feature objects on active DB. 
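# (Illustrative, hedged: if the host lives on a non-default instance, a Feature record
# resolved against another database is re-fetched via Feature.connect(host_db).get(uid=...)
# so that the ids used for linking below are valid on the host's database.)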
if host_db is not None and feature._state.db != host_db: feature = Feature.connect(host_db).get(uid=feature.uid) if feature.uid in values_by_feature_uid and ( values_by_feature_uid[feature.uid] != value ): raise ValidationError( f"Conflicting values for feature '{feature.name}'." ) values_by_feature_uid[feature.uid] = value if feature.uid not in seen_explicit_uids: explicit_features.append(feature) seen_explicit_uids.add(feature.uid) if ( feature.name in normalized_values and normalized_values[feature.name] != value ): raise ValidationError( f"Conflicting values for feature name '{feature.name}'." ) normalized_values[feature.name] = value elif isinstance(key, str): if key in normalized_values and normalized_values[key] != value: raise ValidationError( f"Conflicting values for feature name '{key}'." ) normalized_values[key] = value string_key_values[key] = value else: raise TypeError( "Feature-value dictionary keys must be `str` or `Feature`, " f"got {type(key)}" ) return ( normalized_values, string_key_values, explicit_features, values_by_feature_uid, ) @staticmethod def _merge_feature_objects( explicit_features: list[Feature], looked_up_features, ) -> list[Feature]: merged: list[Feature] = [] seen_uids: set[str] = set() for feature in explicit_features: if feature.uid not in seen_uids: merged.append(feature) seen_uids.add(feature.uid) for feature in looked_up_features: if feature.uid not in seen_uids: merged.append(feature) seen_uids.add(feature.uid) return merged @staticmethod def _raise_not_validated_values( not_validated_values: dict[str, tuple[str, list[str]]], ) -> None: if not not_validated_values: return None hint = "" for key, (field, values_list) in not_validated_values.items(): key_str = "ln.Record" if key == "Record" else key create_true = ", create=True" if "bionty." 
not in key else "" hint += f" records = {key_str}.from_values({values_list}, field='{field}'{create_true}).save()\n" msg = ( f"These values could not be validated: {dict(not_validated_values)}\n" f"Here is how to create records for them:\n\n{hint}" ) raise ValidationError(msg) def _collect_record_feature_writes( self, *, record, feature_objects: list[Feature], dictionary: dict[str, Any], values_by_feature_uid: dict[str, Any] | None, feature_json_values: list, links_by_model: dict, not_validated_values: dict[str, tuple[str, list[str]]], resolved_records_by_feature_id: dict[int, dict[Any, list[SQLRecord]]] | None = None, ) -> None: from ..base.dtypes import is_iterable_of_sqlrecord from .can_curate import CanCurate from .record import RecordJson for feature in feature_objects: if ( values_by_feature_uid is not None and feature.uid in values_by_feature_uid ): value = values_by_feature_uid[feature.uid] else: value = dictionary[feature.name] if value is None: continue if not ( feature.dtype_as_str.startswith("cat") or feature.dtype_as_str.startswith("list[cat") ): _, converted_value, _ = infer_convert_dtype_key_value( key=feature.name, value=value, dtype_str=feature.dtype_as_str ) feature_json_values.append( RecordJson(record=record, feature=feature, value=converted_value) ) continue if isinstance(value, SQLRecord) or is_iterable_of_sqlrecord(value): if isinstance(value, SQLRecord): label_records = [value] else: label_records = value # type: ignore else: if isinstance(value, str): values = [value] # type: ignore else: values = value # type: ignore if feature._dtype_str == "cat": feature._dtype_str = "cat[ULabel]" feature.save() result = { "registry_str": "ULabel", "registry": ULabel, "field": ULabel.name, } else: result = parse_dtype(feature._dtype_str)[0] # Fast path for dataframe-originated record batches: # `bulk_set_features_in_records()` now runs a single `DataFrameCurator` # pass and pre-resolves categorical values to label records. # # The cache key is feature.id and the nested key is the normalized # raw value found in the dataframe. Using this cache here avoids # running per-row `validate()` + `from_values()` calls, which used # to duplicate work already done by the curator. cached_records = None if ( resolved_records_by_feature_id is not None and feature.id in resolved_records_by_feature_id ): cached_records = resolved_records_by_feature_id[feature.id] if cached_records is not None: if isinstance(value, str): values_for_lookup = [value] else: values_for_lookup = value # type: ignore if isinstance(values_for_lookup, (list, tuple, np.ndarray, set)): values_for_lookup = list(values_for_lookup) else: values_for_lookup = [values_for_lookup] label_records = [] not_validated_for_feature = [] for lookup_value in values_for_lookup: normalized_lookup = ( lookup_value.item() if isinstance(lookup_value, np.generic) else lookup_value ) mapped_records = cached_records.get(normalized_lookup) if mapped_records is None: # Keep the same error aggregation behavior as before: # unresolved categorical values are collected and raised # in one ValidationError after all records are processed. not_validated_for_feature.append(normalized_lookup) else: label_records.extend(mapped_records) if not_validated_for_feature: not_validated_values[result["registry_str"]] = ( # type: ignore result["field_str"], not_validated_for_feature, ) elif issubclass(result["registry"], CanCurate): # type: ignore # Fallback path for non-batch callers (e.g. direct # `record.features.add_values()` on an individual record). 
# # Those flows do not build dataframe-level caches, so we keep # the original registry-backed validation and resolution logic. # This branch should not be hot for the dataframe batch import # path because that path provides `resolved_records_by_feature_id`. validated = result["registry"].validate( # type: ignore values, field=result["field"], mute=True ) values_array = np.array(values) validated_values = values_array[validated] if validated.sum() != len(values): not_validated_values[result["registry_str"]] = ( # type: ignore result["field_str"], values_array[~validated].tolist(), ) label_records = result["registry"].from_values( # type: ignore validated_values, field=result["field"], mute=True ) else: label_records = result["registry"].filter( # type: ignore **{f"{result['field_str']}__in": values} ) if len(label_records) != len(values): raise ValidationError( f"Some of these values for {result['registry_str']} do not exist: {values}" ) for label_record in label_records: if label_record._state.adding: raise ValidationError( f"Please save {label_record} before annotation." ) link_model, value_field_name, _ = get_categorical_link_info( record.__class__, label_record.__class__, instance=getattr(record._state, "db", None), ) links_by_model[link_model].append( link_model( record_id=record.id, feature_id=feature.id, **{f"{value_field_name}_id": label_record.id}, ) ) return None def add_values( self, values: dict[str | Feature, Any], feature_field: FieldAttr = Feature.name, schema: Schema = None, ) -> None: """Add values for features. Like `set_values()`, but slightly more performant because it does not remove previously-existing feature annotations at the danger of violating multiplicity of categorical dtypes (see warning below). Args: values: A dictionary of keys (features) & values (labels, strings, numbers, booleans, datetimes, etc.). Keys can be feature names (`str`) or `Feature` objects. If a value is `None`, it will be skipped. feature_field: The field of a registry to map the keys of the `values` dictionary in case strings are passed. schema: Schema to validate against. .. warning:: If you run:: obj.features.add_values({"my_categorical": "my_category1"}) obj.features.add_values({"my_categorical": "my_category2"}) you will annotate the object with two different values for the same feature even if its dtype is not a `list`. That is, `add_values()` does **not** validate the `dtype` of a categorical feature across multiple calls. To avoid this, please use `set_values()`. .. dropdown:: Why is multiplicity of categorical dtypes not validated? For simple data types like `int`, `date`, `dict`, etc., `add_values()` ensures that there is only one value for a given `Record` and feature. But for categorical/relational features or for simple dtypes in the context of annotating an `Artifact`, the underlying link table allows linking multiple values to the same object and feature, so that both `list` dtypes and `set`-like aggregations on an object can be represented with relational integrity. 
Examples:: # the following needs to be allowed even if `cell_type` has dtype `CellType`, and not `list[CellType]` # this is because the artifact might be a `DataFrame` with a column `cell_type` that has dtype `CellType` # and the annotations on the artifact-level represent the aggregation of all values in that column artifact.features.add_values({"cell_type": "B cell"}) artifact.features.add_values({"cell_type": "T cell"}) artifact.features.add_values({"cell_type": "NK cell"}) # now an example for Record # while a record will never represent an aggregation, we still want to express # lists of values with relational integrity, for instance, this record.features.add_values({"cell_types": ["B cell", "T cell", "NK cell"]}) """ from lamindb.curators.core import ExperimentalDictCurator host_is_record = self._host.__class__.__name__ == "Record" host_is_artifact = self._host.__class__.__name__ == "Artifact" # rename to distinguish from the values inside the dict ( dictionary, string_key_values, explicit_features, values_by_feature_uid, ) = self._resolve_feature_value_dictionary(values) keys = dictionary.keys() if isinstance(keys, DICT_KEYS_TYPE): keys = list(keys) # type: ignore if ( host_is_record and self._host.type is not None and self._host.type.schema is not None # type: ignore ): assert schema is None, "Cannot pass schema if record.type has schema." schema = self._host.type.schema # type: ignore if host_is_artifact: if self._get_external_schema(): raise ValueError("Cannot add values if artifact has external schema.") if schema is not None: member_ids = set(schema.members.values_list("id", flat=True)) features_not_in_schema = [ feature.name for feature in explicit_features if feature.id not in member_ids ] if features_not_in_schema: raise ValidationError( "These feature keys are not in the provided schema: " f"{features_not_in_schema}" ) looked_up_features = schema.members.filter(name__in=keys) feature_objects = self._merge_feature_objects( explicit_features, looked_up_features ) else: if string_key_values: looked_up_features = self._get_feature_objects( string_key_values, feature_field ) else: looked_up_features = Feature.objects.none() feature_objects = self._merge_feature_objects( explicit_features, looked_up_features ) schema = Schema(feature_objects) ExperimentalDictCurator( dictionary, schema, require_saved_schema=False ).validate() return self._add_values( feature_objects, dictionary, values_by_feature_uid=values_by_feature_uid, ) def _add_values( self, feature_objects, dictionary, *, values_by_feature_uid: dict[str, Any] | None = None, ): from ..base.dtypes import is_iterable_of_sqlrecord from .can_curate import CanCurate host_is_record = self._host.__class__.__name__ == "Record" if host_is_record: feature_json_values: list[SQLRecord] = [] links_by_model: dict[type[SQLRecord], list[SQLRecord]] = defaultdict(list) record_not_validated_values: dict[str, tuple[str, list[str]]] = {} self._collect_record_feature_writes( record=self._host, feature_objects=feature_objects, dictionary=dictionary, values_by_feature_uid=values_by_feature_uid, feature_json_values=feature_json_values, links_by_model=links_by_model, not_validated_values=record_not_validated_values, ) self._raise_not_validated_values(record_not_validated_values) if feature_json_values: save(feature_json_values) for links in links_by_model.values(): try: save(links, ignore_conflicts=False) except Exception: save(links, ignore_conflicts=True) return None features_labels = defaultdict(list) feature_json_values = [] 
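# Descriptive note (hedged): `features_labels` accumulates (feature, label) pairs keyed by
# registry string, e.g. {"ULabel": [(<Feature experiment>, <ULabel 'Experiment 1'>)],
# "bionty.CellType": [(<Feature cell_type>, <CellType 'neuron'>)]} (example data is
# hypothetical); `feature_json_values` collects JsonValue rows for non-categorical values.
# Both are persisted after the loop below.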
not_validated_values: dict[str, tuple[str, list[str]]] = {} for feature in feature_objects: if ( values_by_feature_uid is not None and feature.uid in values_by_feature_uid ): value = values_by_feature_uid[feature.uid] else: value = dictionary[feature.name] if value is None: continue if not ( feature.dtype_as_str.startswith("cat") or feature.dtype_as_str.startswith("list[cat") ): _, converted_value, _ = infer_convert_dtype_key_value( key=feature.name, value=value, dtype_str=feature.dtype_as_str ) filter_kwargs = {"feature": feature, "value": converted_value} feature_value, _ = JsonValue.get_or_create(**filter_kwargs) feature_json_values.append(feature_value) else: if isinstance(value, SQLRecord) or is_iterable_of_sqlrecord(value): if isinstance(value, SQLRecord): label_records = [value] else: label_records = value # type: ignore for record in label_records: if record._state.adding: raise ValidationError( f"Please save {record} before annotation." ) features_labels[ record.__class__.__get_name_with_module__() ].append((feature, record)) else: if isinstance(value, str): values = [value] # type: ignore else: values = value # type: ignore if feature._dtype_str == "cat": new_dtype_str = feature._dtype_str + "[ULabel]" feature._dtype_str = new_dtype_str feature.save() result = { "registry_str": "ULabel", "registry": ULabel, "field": ULabel.name, } else: result = parse_dtype(feature._dtype_str)[0] if issubclass(result["registry"], CanCurate): # type: ignore validated = result["registry"].validate( # type: ignore values, field=result["field"], mute=True ) values_array = np.array(values) validated_values = values_array[validated] if validated.sum() != len(values): not_validated_values[result["registry_str"]] = ( # type: ignore result["field_str"], values_array[~validated].tolist(), ) label_records = result["registry"].from_values( # type: ignore validated_values, field=result["field"], mute=True ) else: label_records = result["registry"].filter( # type: ignore **{f"{result['field_str']}__in": values} ) if len(label_records) != len(values): raise ValidationError( f"Some of these values for {result['registry_str']} do not exist: {values}" ) features_labels[result["registry_str"]] += [ # type: ignore (feature, label_record) for label_record in label_records ] # TODO: given we had already validated prior to calling _add_values, this block below should never be reached # refactor this out if possible self._raise_not_validated_values(not_validated_values) if features_labels: self._add_label_feature_links(features_labels) if feature_json_values: to_insertjson_values = [ record for record in feature_json_values if record._state.adding ] if to_insertjson_values: save(to_insertjson_values) links = [ self._host.json_values.through( **{ f"{self._host.__class__.__name__.lower()}_id": self._host.id, "jsonvalue_id": json_value.id, } ) for json_value in feature_json_values ] # a link might already exist, hence ignore_conflicts is needed save(links, ignore_conflicts=True) def set_values( self, values: dict[str | Feature, Any], feature_field: FieldAttr = Feature.name, schema: Schema = None, ) -> None: """Set values for features. Note that, in the context of annotating an `Artifact`, this does **not** affect the annotations derived from the artifact's dataset features. It only sets the artifact's external feature annotations. Args: values: A dictionary of keys (features) & values (labels, strings, numbers, booleans, datetimes, etc.). Keys can be feature names (`str`) or `Feature` objects. 
If a value is `None`, it will be skipped. feature_field: The field of a registry to map the keys of the `values` dictionary in case strings are passed. schema: Schema to validate against. Examples: Here is how to annotate an artifact ad hoc:: artifact.features.set_values({ "species": "human", "scientist": ['Barbara McClintock', 'Edgar Anderson'], "temperature": 27.6, "experiment": "Experiment 1" }) Query artifacts by features:: ln.Artifact.filter(scientist="Barbara McClintock") If your feature names are ambiguous, you can use a `Feature` object to disambiguate:: temperature = ln.Feature.get(name="temperature", type__name="my_feature_type") # to set feature values artifact.features.set_values({temperature: 0.5}) # temperature is the feature object # to query by feature values ln.Artifact.filter(temperature == 0.5) # instead of temperature=0.5 You can pass a schema to validate the dictionary:: schema = ln.Schema([ln.Feature(name="species", dtype=str).save()]).save() artifact.features.set_values({"species": "bird"}, schema=schema) Also see :class:`lamindb.Artifact.features`, :class:`lamindb.Record.features`, and :class:`lamindb.Run.features`. """ from lamindb.curators.core import ExperimentalDictCurator host_is_record = self._host.__class__.__name__ == "Record" host_is_artifact = self._host.__class__.__name__ == "Artifact" # rename to distinguish from the values inside the dict ( dictionary, string_key_values, explicit_features, values_by_feature_uid, ) = self._resolve_feature_value_dictionary(values) keys = dictionary.keys() if isinstance(keys, DICT_KEYS_TYPE): keys = list(keys) # type: ignore if ( host_is_record and self._host.type is not None and self._host.type.schema is not None # type: ignore ): assert schema is None, "Cannot pass schema if record.type has schema." schema = self._host.type.schema # type: ignore if host_is_artifact: schema = self._get_external_schema() if schema is not None: ExperimentalDictCurator(dictionary, schema).validate() member_ids = set(schema.members.values_list("id", flat=True)) features_not_in_schema = [ feature.name for feature in explicit_features if feature.id not in member_ids ] if features_not_in_schema: raise ValidationError( "These feature keys are not in the provided schema: " f"{features_not_in_schema}" ) looked_up_features = schema.members.filter(name__in=keys) feature_objects = self._merge_feature_objects( explicit_features, looked_up_features ) else: if string_key_values: looked_up_features = self._get_feature_objects( string_key_values, feature_field ) else: looked_up_features = Feature.objects.none() feature_objects = self._merge_feature_objects( explicit_features, looked_up_features ) self._remove_values() self._add_values( feature_objects, dictionary=dictionary, values_by_feature_uid=values_by_feature_uid, ) def _get_external_schema(self) -> Schema | None: external_schema = None if self._host.otype is None: external_schema = self._host.schema elif self._host.schema is not None: external_schema = self._host.schema.slots.get("__external__", None) return external_schema def remove_values( self, feature: ( str | Feature | list[str | Feature] | dict[str | Feature, Any | None] | None ) = None, *, value: Any | None = None, ) -> None: """Remove values for features. Args: feature: Indicate one or several features for which to remove values. If `None`, values for all external features will be removed. Also supports a dictionary mapping feature keys to values to remove, e.g. `{feature: value}`. 
value: An optional value to restrict removal to a single value. """ host_name = self._host.__class__.__name__.lower() host_is_artifact = host_name == "artifact" if host_is_artifact: external_schema = self._get_external_schema() if external_schema is not None: raise ValueError( "Cannot remove values if artifact has external schema." ) return self._remove_values( feature, value=value, ) def _remove_values( self, feature: ( str | Feature | list[str | Feature] | dict[str | Feature, Any | None] | None ) = None, *, value: Any | None = None, ) -> None: from django.apps import apps host_name = self._host.__class__.__name__.lower() host_is_record = host_name == "record" host_is_artifact = host_name == "artifact" if isinstance(feature, dict): if value is not None: raise ValueError( "Pass either `value=` or per-feature values via a dictionary, not both." ) for one_feature, one_value in feature.items(): self._remove_values(one_feature, value=one_value) return if feature is None: features = get_features_data( self._host, to_dict=True, external_only=True ).keys() elif not isinstance(feature, list): features = [feature] else: features = feature for feature in features: if isinstance(feature, str): feature_record = Feature.get(name=feature) else: feature_record = feature if feature_record._state.adding: raise ValidationError( f"Please save feature '{feature_record.name}' before annotation." ) if ( self._host._state.db is not None and feature_record._state.db != self._host._state.db ): feature_record = Feature.connect(self._host._state.db).get( uid=feature_record.uid ) if host_is_artifact: for schema in self.slots.values(): if feature_record in schema.members: raise ValueError("Cannot remove values for dataset features.") filter_kwargs = {"feature": feature_record} none_message = f"with value {value!r} " if value is not None else "" if feature_record._dtype_str.startswith(("cat[", "list[cat")): # type: ignore feature_registry = parse_dtype(feature_record._dtype_str)[0][ "registry_str" ] if "." in feature_registry: parts = feature_registry.split(".") app_label = parts[0] entity_name = parts[-1] else: app_label = "lamindb" entity_name = feature_registry host_name = self._host.__class__.__name__ link_model_name = f"{host_name}{entity_name}" link_model = apps.get_model(app_label, link_model_name) filter_kwargs[host_name.lower()] = self._host if value is not None: if not isinstance(value, SQLRecord): raise TypeError( f"Expected a record for removing categorical feature value, " f"got {value} of type {type(value)}" ) assert not host_is_record, "Only artifacts support passing a value." filter_kwargs[entity_name.lower()] = value link_records = link_model.objects.filter(**filter_kwargs) if not link_records.exists(): value_msg = f"with value {value!r} " if value is not None else "" logger.warning( f"no feature '{feature_record.name}' {value_msg}found on " f"{host_name.lower()} '{self._host.uid}'!" ) return link_records.delete() else: if value is not None: filter_kwargs["value"] = value if host_is_record: feature_values = self._host.values_json.filter(**filter_kwargs) else: feature_values = self._host.json_values.filter(**filter_kwargs) if not feature_values.exists(): logger.warning( f"no feature '{feature_record.name}' {none_message}found on {self._host.__class__.__name__.lower()} '{self._host.uid}'!" 
) return if host_is_record: feature_values.delete(permanent=True) else: # the below might leave a dangling feature_value record # but we don't want to pay the price of making another query just to remove this annotation # we can clean the JsonValue registry periodically if we want to self._host.json_values.remove(*feature_values) def _add_schema(self, schema: Schema, slot: str) -> None: """Annotate artifact with a schema. Args: schema: `Schema` A schema record. slot: `str` The slot that marks where the schema is stored in the artifact. """ # TODO: deprecate as soon as we have the Schema-based curators if self._host._state.adding: raise ValueError( "Please save the artifact or collection before adding a feature set!" ) host_db = self._host._state.db schema.save(using=host_db) kwargs = { "artifact_id": self._host.id, "schema": schema, "slot": slot, } link_record = ( self._host.schemas.through.objects.using(host_db) .filter(**kwargs) .one_or_none() ) if link_record is None: self._host.schemas.through(**kwargs).save(using=host_db) if slot in self.slots: logger.debug(f"replaced existing {slot} feature set") self._slots[slot] = schema # type: ignore def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None): """Transfer features from a artifact or collection.""" # This only covers feature sets if transfer_logs is None: transfer_logs = {"mapped": [], "transferred": [], "run": None} from lamindb import settings using_key = settings._using_key for slot, schema in data.features.slots.items(): # type: ignore try: members = schema.members except ModuleWasntConfigured as err: logger.warning(f"skipping transfer of {slot} schema because {err}") continue if len(members) == 0: continue if len(members) > settings.annotation.n_max_records: logger.warning( f"skipping creating {len(members)} > {settings.annotation.n_max_records} new {members[0].__class__.__name__} records" ) schema_self = schema schema_exists = Schema.filter(hash=schema_self.hash).one_or_none() if schema_exists is not None: schema_self = schema_exists else: schema_self.save() else: registry = members[0].__class__ # note here the features are transferred based on an unique field field = REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), "uid") # this will be e.g. 
be a list of ontology_ids or uids member_uids = list(members.values_list(field, flat=True)) validated = registry.validate(member_uids, field=field, mute=True) new_members_uids = list(compress(member_uids, ~validated)) new_members = members.filter(**{f"{field}__in": new_members_uids}) n_new_members = len(new_members) if len(members) > settings.annotation.n_max_records: logger.warning( f"skipping creating {n_new_members} > {settings.annotation.n_max_records} new {registry.__name__} records" ) if n_new_members > 0: # transfer foreign keys needs to be run before transfer to default db transfer_fk_to_default_db_bulk( new_members, using_key, transfer_logs=transfer_logs ) for feature in new_members: # not calling save=True here as in labels, because want to # bulk save below # transfer_fk is set to False because they are already transferred # in the previous step transfer_fk_to_default_db_bulk transfer_to_default_db( feature, using_key, transfer_fk=False, transfer_logs=transfer_logs, ) save( new_members, ignore_conflicts=True ) # conflicts arising from existing records are ignored # create a new feature set from feature values using the same uid schema_self = Schema.from_values( member_uids, field=getattr(registry, field) ) if schema_self is None: if hasattr(registry, "organism_id"): logger.warning( f"Schema is not transferred, check if organism is set correctly: {schema}" ) continue # make sure the uid matches if schema is composed of same features if schema_self.hash == schema.hash: schema_self.uid = schema.uid logger.info(f"saving {slot} schema: {schema_self}") try: self._host.features._add_schema(schema_self, slot) except IntegrityError: logger.warning( f"updating annotation of artifact {self._host.uid} with feature set for slot: {slot}" ) self._host.schemas.through.objects.get( artifact_id=self._host.id, slot=slot ).delete() self._host.features._add_schema(schema_self, slot) def bulk_set_features_in_records(records: Iterable[Record]) -> None: """Bulk-set lazy feature dictionaries for records. Intended for records created via `Record(features=...)` and persisted with `ln.save([...])`. """ import pandas as pd from lamindb.curators.core import DataFrameCurator records_with_features = [ record for record in records if hasattr(record, "_features") and record._features is not None ] if len(records_with_features) == 0: return None batch_schema: Schema | None = None prepared_records: list[ tuple[Record, FeatureManager, dict[str, Any], list[Feature], dict[str, Any]] ] = [] prepared_rows: list[dict[str, Any]] = [] for record in records_with_features: schema = None if record.type is not None and record.type.schema is not None: schema = record.type.schema if schema is None: raise ValidationError( "Bulk setting features in records requires all records to have the same non-null type schema." ) if batch_schema is None: batch_schema = schema elif schema.id != batch_schema.id: raise ValidationError( "Bulk setting features in records requires all records to have the same type schema." 
) manager = record.features ( dictionary, _, explicit_features, values_by_feature_uid, ) = manager._resolve_feature_value_dictionary(record._features) prepared_rows.append(dictionary) prepared_records.append( (record, manager, dictionary, explicit_features, values_by_feature_uid) ) assert batch_schema is not None # noqa: S101 schema_features = list(batch_schema.members.all()) dataframe = pd.DataFrame(prepared_rows) for feature in schema_features: if ( feature.name in dataframe and feature.dtype_as_str.startswith("cat") and not feature.dtype_as_str.startswith("list[cat") ): dataframe[feature.name] = dataframe[feature.name].astype("category") # Single-pass dataframe curation: # validate schema and resolve categoricals once for the entire batch. # # The resolved label records are then reused below when creating per-record # link rows, avoiding repeated registry calls for each row. curator = DataFrameCurator(dataframe, batch_schema) curator.validate() members_by_name: dict[str, list[Feature]] = defaultdict(list) schema_member_ids: set[int] = set() resolved_records_by_feature_id: dict[int, dict[Any, list[SQLRecord]]] = {} for feature in schema_features: members_by_name[feature.name].append(feature) schema_member_ids.add(feature.id) if not ( feature.dtype_as_str.startswith("cat") or feature.dtype_as_str.startswith("list[cat") ): continue cat_vector = curator.cat._cat_vectors.get(feature.name) if cat_vector is None or cat_vector.records is None: continue # Build lookup cache: # feature.id -> raw value -> [resolved label records] # # We intentionally keep a list of records per value to support # list-categorical and potential multi-match cases consistently with # existing link creation semantics. cache_for_feature: dict[Any, list[SQLRecord]] = defaultdict(list) for label_record in cat_vector.records: key = getattr(label_record, cat_vector._field_name) normalized_key = key.item() if isinstance(key, np.generic) else key cache_for_feature[normalized_key].append(label_record) resolved_records_by_feature_id[feature.id] = dict(cache_for_feature) feature_json_values: list[SQLRecord] = [] links_by_model: dict[type[SQLRecord], list[SQLRecord]] = defaultdict(list) not_validated_values: dict[str, tuple[str, list[str]]] = {} for ( record, manager, dictionary, explicit_features, values_by_feature_uid, ) in prepared_records: keys = list(dictionary.keys()) features_not_in_schema = [ feature.name for feature in explicit_features if feature.id not in schema_member_ids ] if features_not_in_schema: raise ValidationError( "These feature keys are not in the provided schema: " f"{features_not_in_schema}" ) looked_up_features = [ feature for key in keys for feature in members_by_name.get(key, []) ] feature_objects = manager._merge_feature_objects( explicit_features, looked_up_features ) manager._collect_record_feature_writes( record=record, feature_objects=feature_objects, dictionary=dictionary, values_by_feature_uid=values_by_feature_uid, feature_json_values=feature_json_values, links_by_model=links_by_model, not_validated_values=not_validated_values, resolved_records_by_feature_id=resolved_records_by_feature_id, ) FeatureManager._raise_not_validated_values(not_validated_values) if feature_json_values: save(feature_json_values) for links in links_by_model.values(): try: save(links, ignore_conflicts=False) except Exception: save(links, ignore_conflicts=True) for record in records_with_features: del record._features return None ================================================ FILE: lamindb/models/_from_values.py 
================================================ from __future__ import annotations from typing import TYPE_CHECKING from lamin_utils import colors, logger if TYPE_CHECKING: from pandas import DataFrame, Index from lamindb.base.types import FieldAttr, ListLike from .query_set import SQLRecordList from .sqlrecord import SQLRecord # The base function for `from_values` def _from_values( iterable: ListLike, field: FieldAttr, *, create: bool = False, organism: SQLRecord | str | None = None, source: SQLRecord | None = None, standardize: bool = True, from_source: bool = True, mute: bool = False, **filter_kwargs, ) -> SQLRecordList: """Get or create records from iterables.""" from .query_set import SQLRecordList registry = field.field.model # type: ignore organism_record = get_organism_record_from_field(field, organism, values=iterable) # TODO: the create is problematic if field is not a name field if create: create_kwargs = {} if organism_record: create_kwargs["organism"] = organism_record return SQLRecordList( [ registry(**{field.field.name: value}, **create_kwargs) for value in iterable ] ) # type: ignore iterable_idx = index_iterable(iterable) # returns existing records & non-existing values records, nonexist_values, msg = get_existing_records( iterable_idx=iterable_idx, field=field, organism=organism_record, mute=mute, **filter_kwargs, ) # new records to be created based on new values if len(nonexist_values) > 0: if from_source and registry.__base__.__name__ == "BioRecord": # if can and needed, get organism record from the existing records if ( organism_record is None and len(records) > 0 and registry.require_organism() ): organism_record = records[0].organism records_public, unmapped_values = create_records_from_source( iterable_idx=nonexist_values, field=field, organism=organism_record, source=source, standardize=standardize, msg=msg, mute=mute, ) if len(records_public) > 0: msg = "" for record in records_public: record._from_source = True records += records_public else: unmapped_values = nonexist_values # unmapped new_ids will NOT create records if len(unmapped_values) > 0: # first log the success message if len(msg) > 0 and not mute: logger.success(msg) s = "" if len(unmapped_values) == 1 else "s" print_values = colors.yellow(_format_values(unmapped_values)) n_nonval = colors.yellow(f"{len(unmapped_values)} non-validated") if not mute: logger.info( f"{colors.red('did not create')} {registry.__name__} record{s} for " f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}" # type: ignore ) return SQLRecordList(records) def get_existing_records( iterable_idx: Index, field: FieldAttr, organism: SQLRecord | None = None, standardize: bool = True, mute: bool = False, **filter_kwargs, ) -> tuple[list, Index, str]: """Get existing records from the database.""" import pandas as pd from .can_curate import _validate # NOTE: existing records matching is agnostic to the source registry = field.field.model # type: ignore queryset = registry.filter(**filter_kwargs) if standardize: # log synonyms mapped terms if hasattr(registry, "standardize"): syn_mapper = queryset.standardize( iterable_idx, field=field, organism=organism, mute=True, from_source=False, # standardize only based on the DB reference return_mapper=True, ) iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index else: syn_mapper = {} # now we have to sort the list of queried records # preserved = Case( # *[ # When(**{field.field.name: value}, then=pos) # for pos, value in enumerate(iterable_idx) # ] # ) # order 
by causes a factor 10 in runtime # records = query_set.order_by(preserved).to_list() # log validated terms is_validated = _validate( cls=queryset, values=iterable_idx, field=field, organism=organism, mute=True ) if len(is_validated) > 0: validated = iterable_idx[is_validated] else: validated = [] msg = "" syn_msg = "" if not mute: if len(validated) > 0: s = "" if len(validated) == 1 else "s" print_values = colors.green(_format_values(validated)) msg = ( "loaded" f" {colors.green(f'{len(validated)} {registry.__name__} record{s}')}" f" matching {colors.italic(f'{field.field.name}')}: {print_values}" ) if len(syn_mapper) > 0: s = "" if len(syn_mapper) == 1 else "s" names = list(syn_mapper.keys()) print_values = colors.green(_format_values(names)) syn_msg = ( "loaded" f" {colors.green(f'{len(syn_mapper)} {registry.__name__} record{s}')}" f" matching {colors.italic('synonyms')}: {print_values}" ) # no logging if all values are validated # logs if there are synonyms if len(syn_msg) > 0: if len(msg) > 0 and not mute: logger.success(msg) if not mute: logger.success(syn_msg) msg = "" # get all existing records in the db query = {f"{field.field.name}__in": iterable_idx.values} # type: ignore if organism is not None: query["organism"] = organism records = queryset.filter(**query).to_list() if len(validated) == len(iterable_idx): return records, pd.Index([]), msg else: nonval_values = iterable_idx.difference(validated) return records, nonval_values, msg def create_records_from_source( iterable_idx: Index, field: FieldAttr, organism: SQLRecord | None = None, source: SQLRecord | None = None, standardize: bool = True, msg: str = "", mute: bool = False, ) -> tuple[list, Index]: """Create records from source.""" registry = field.field.model # type: ignore records: list = [] # populate additional fields from public_df from bionty._organism import OrganismNotSet from bionty._source import filter_public_df_columns, get_source_record # get the default source if organism is None and registry.require_organism(field=field): raise OrganismNotSet( f"`organism` is required to create new {registry.__name__} records from source!" 
) try: source_record = get_source_record(registry, organism, source) except ValueError: # no source found return records, iterable_idx # create the corresponding PublicOntology object from registry try: public_ontology = registry.public(source=source_record) except Exception: # no public source return records, iterable_idx # filter the columns in public df based on fields public_df = filter_public_df_columns( registry=registry, public_ontology=public_ontology ) if public_df.empty: return records, iterable_idx # standardize in the public reference # do not inspect synonyms if the field is not name field result = public_ontology.inspect( iterable_idx, field=field.field.name, # type: ignore standardize=False if hasattr(registry, "_name_field") and field.field.name != registry._name_field else standardize, # type: ignore mute=True, ) syn_mapper = result.synonyms_mapper msg_syn: str = "" if len(syn_mapper) > 0: s = "" if len(syn_mapper) == 1 else "s" names = list(syn_mapper.keys()) print_values = colors.purple(_format_values(names)) msg_syn = ( "created" f" {colors.purple(f'{len(syn_mapper)} {registry.__name__} record{s} from Bionty')}" f" matching {colors.italic('synonyms')}: {print_values}" ) iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index # create records for values that are found in the public reference # matching either field or synonyms mapped_values = iterable_idx.intersection(public_df[field.field.name]) # type: ignore multi_msg = "" if len(mapped_values) > 0: public_kwargs, multi_msg = _bulk_create_dicts_from_df( keys=mapped_values, column_name=field.field.name, # type: ignore df=public_df, ) create_kwargs = ( {"organism": organism, "source": source_record} if organism is not None else {"source": source_record} ) for bk in public_kwargs: # skip validation to speed up bulk creation since the values don't validate in the registry DB yet records.append(registry(**bk, **create_kwargs, _skip_validation=True)) # number of records that matches field (not synonyms) validated = result.validated if len(validated) > 0: s = "" if len(validated) == 1 else "s" print_values = colors.purple(_format_values(validated)) # this is the success msg for existing records in the DB from get_existing_records if len(msg) > 0 and not mute: logger.success(msg) if not mute: logger.success( "created" f" {colors.purple(f'{len(validated)} {registry.__name__} record{s} from Bionty')}" f" matching {colors.italic(f'{field.field.name}')}: {print_values}" # type: ignore ) # make sure that synonyms logging appears after the field logging if len(msg_syn) > 0 and not mute: logger.success(msg_syn) # warning about multi matches if len(multi_msg) > 0 and not mute: logger.warning(multi_msg) # return the values that are not found in the public reference unmapped_values = iterable_idx.difference(mapped_values) return records, unmapped_values def index_iterable(iterable: ListLike) -> Index: """Get unique values from an iterable.""" import pandas as pd idx = pd.Index(iterable).unique() # No entries are made for NAs, '', None # returns an ordered unique not null list return idx[(idx != "") & (~idx.isnull())] def _format_values( names: ListLike, n: int = 20, quotes: bool = True, sep: str = "'" ) -> str: """Format values for printing.""" items = {str(name): None for name in names if name != "None"} unique_items = list(items.keys()) if quotes: unique_items = [f"{sep}{item}{sep}" for item in unique_items] print_values = ", ".join(unique_items[:n]) if len(unique_items) > n: print_values += ", ..." 
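# At this point `print_values` is a deduplicated, quoted, comma-separated preview
# truncated to `n` items; e.g. (illustrative) _format_values(["a", "b", "c"], n=2)
# returns "'a', 'b', ..."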
return print_values def _bulk_create_dicts_from_df( keys: set | list, column_name: str, df: DataFrame ) -> tuple[dict, str]: """Get fields from a DataFrame for many rows.""" multi_msg = "" if df.index.name != column_name: df = df.set_index(column_name).loc[list(keys)] if not df.index.is_unique: # return all records for multi-matches with a warning dup = df.index[df.index.duplicated()].unique().tolist() if len(dup) > 0: s = "" if len(dup) == 1 else "s" print_values = _format_values(dup) multi_msg = ( f"ambiguous validation in Bionty for {len(dup)} record{s}:" f" {print_values}" ) return df.reset_index().to_dict(orient="records"), multi_msg def get_organism_record_from_field( # type: ignore field: FieldAttr, organism: str | SQLRecord | None = None, values: ListLike = None, using_key: str | None = None, ) -> SQLRecord | None: """Get organism record based on which field is used in from_values. Args: field: the field of the registry for from_values organism: the organism to get the organism record for values: the values passed to from_values using_key: the db to get the organism record from Returns: The organism record if both conditions are met: The organism FK is required for the registry The field is not unique (e.g. Gene.symbol) or the organism is not None """ registry = field.field.model if registry.__base__.__name__ != "BioRecord": return None from bionty._organism import ( create_or_get_organism_record, infer_organism_from_ensembl_id, ) if values is None: values = [] # if the field is bionty.Gene.ensembl_gene_id, infer organism from ensembl id if ( registry.__get_name_with_module__() == "bionty.Gene" and field.field.name == "ensembl_gene_id" and len(values) > 0 and organism is None ): # Check if values contain bionty.Gene objects with organism field from collections.abc import Iterable # first check if we have Gene objects for v in values: # early return to not loop through all values to find a string if isinstance(v, str): break if isinstance(v, registry) and v.organism is not None: return v.organism # Handle iterables containing Gene objects (but not strings, which are also iterable) elif isinstance(v, Iterable) and not isinstance(v, str): for item in v: if isinstance(item, registry) and item.organism is not None: return item.organism # If no bionty.Gene with organism found, fall back to string-based inference # pass the first ensembl id that starts with ENS to infer organism first_ensembl = next( (v for v in values if isinstance(v, str) and v.startswith("ENS")), "" ) if first_ensembl: return infer_organism_from_ensembl_id(first_ensembl, using_key) return create_or_get_organism_record( organism=organism, registry=registry, field=field ) ================================================ FILE: lamindb/models/_is_versioned.py ================================================ from __future__ import annotations from pathlib import PurePosixPath from typing import TYPE_CHECKING, Any, Iterable, Literal from django.db import models from django.db.models import Q from lamin_utils import logger from lamin_utils._base62 import increment_base62 from lamindb.base import uids from lamindb.base.fields import ( BooleanField, CharField, ) if TYPE_CHECKING: # noqa from lamindb.models.query_set import QuerySet class IsVersioned(models.Model): """Base class for versioned models.""" class Meta: abstract = True _len_stem_uid: int version_tag: str | None = CharField(max_length=30, null=True, db_index=True) """Version tag (default `None`). Consider using `semantic versioning `__ with `Python versioning `__. 
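For example (an illustrative sketch, the value is hypothetical)::

    artifact.version = "1.2"  # stored in `version_tag` via the `version` property setter below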
""" is_latest: bool = BooleanField(default=True, db_index=True) """Boolean flag that indicates whether a record is the latest in its version family.""" def __init__( self, *args, **kwargs, ): self._revises = kwargs.pop("revises", None) super().__init__(*args, **kwargs) @property def stem_uid(self) -> str: """Universal id characterizing the version family. The full uid of a record is obtained via concatenating the stem uid and version information:: stem_uid = random_base62(n_char) # a random base62 sequence of length 12 (transform) or 16 (artifact, collection) version_uid = "0000" # an auto-incrementing 4-digit base62 number uid = f"{stem_uid}{version_uid}" # concatenate the stem_uid & version_uid """ return self.uid[: self._len_stem_uid] # type: ignore @property def version(self) -> str: """The version of an object. Defines version of an object within a family of objects characterized by the same `stem_uid`. Returns `.version_tag` if set, otherwise the last 4 characters of the `uid`. """ return self.version_tag if self.version_tag else self.uid[-4:] # type: ignore @version.setter def version(self, value: str | None) -> None: self.version_tag = value @property def versions(self) -> QuerySet: """Lists all records of the same version family. Example:: artifact.versions.to_dataframe() # all versions of the artifact in a dataframe artifact.versions.get(is_latest=True) # the latest version of the artifact """ return ( self.__class__.connect(self._state.db) .filter(uid__startswith=self.stem_uid) .order_by("-created_at") ) def _add_to_version_family( self, revises: IsVersioned, version_tag: str | None = None ): """Add current record to a version family. Args: revises: a record that belongs to the version family. version_tag: semantic version tag of the record. """ old_uid = self.uid # type: ignore new_uid, revises = create_uid(revises=revises, version_tag=version_tag) if ( self.__class__.__name__ == "Artifact" and self._real_key is None and (self._key_is_virtual or self.key is None) ): from lamindb.core.storage.paths import auto_storage_key_from_artifact_uid old_path = self.path new_storage_key = auto_storage_key_from_artifact_uid( new_uid, self.suffix, self._overwrite_versions ) new_path = old_path.rename( old_path.with_name(PurePosixPath(new_storage_key).name) ) logger.success(f"updated path from {old_path} to {new_path}!") self.uid = new_uid self.version_tag = version_tag self.save() logger.success(f"updated uid from {old_uid} to {new_uid}!") def bump_version( version: str, bump_type: str = "minor", behavior: Literal["prompt", "error", "ignore"] = "error", ) -> str: """Bumps the version number by major or minor depending on the bump_type flag. Args: version: The current version in "MAJOR" or "MAJOR.MINOR" format. bump_type: The type of version bump, either 'major' or 'minor'. Returns: The new version string. 
""" try: # Split the version into major and minor parts if possible parts = version.split(".") major = int(parts[0]) minor = int(parts[1]) if len(parts) > 1 else 0 if bump_type == "major": # Bump the major version and reset the minor version new_version = f"{major + 1}" elif bump_type == "minor": # Bump the minor version new_version = f"{major}.{minor + 1}" else: raise ValueError("bump_type must be 'major' or 'minor'") except (ValueError, IndexError): if behavior == "prompt": new_version = input( f"The current version is '{version}' - please type the new version: " ) elif behavior == "error": raise ValueError( "Cannot auto-increment non-integer castable version, please provide" " manually" ) from None else: logger.warning("could not auto-increment version, fix '?' manually") new_version = "?" return new_version def set_version(version: str | None = None, previous_version: str | None = None): """(Auto-) set version. If `version` is `None`, returns the stored version. Otherwise sets the version to the passed version. Args: version: Version string. previous_version: Previous version string. """ if version is None and previous_version is not None: version = bump_version(previous_version, bump_type="major") return version def create_uid( *, version_tag: str | None = None, n_full_id: int = 20, revises: IsVersioned | None = None, ) -> tuple[str, IsVersioned | None]: """This also updates revises in case it's not the latest version. This is why it returns revises. """ if revises is not None: latest_in_family = ( revises.__class__.objects.filter(uid__startswith=revises.stem_uid) .order_by("uid") .last() ) if latest_in_family is not None and latest_in_family.uid != revises.uid: revises = latest_in_family logger.warning( f"didn't pass the latest version in `revises`, retrieved it: {revises}" ) suid = revises.stem_uid vuid = increment_base62(revises.uid[-4:]) # type: ignore else: suid = uids.base62(n_full_id - 4) vuid = "0000" if version_tag is not None: if not isinstance(version_tag, str): raise ValueError( "`version` parameter must be `None` or `str`, e.g., '0.1', '1', '2', etc." ) if revises is not None: if version_tag == revises.version_tag: raise ValueError( f"Please change the version tag or leave it `None`, '{revises.version_tag}' is already taken" ) return suid + vuid, revises def process_revises( revises: IsVersioned | None, version_tag: str | None, key: str | None, description: str | None, type: type[IsVersioned], ) -> tuple[str, str, str, str, IsVersioned | None]: if revises is not None and not isinstance(revises, type): raise TypeError(f"`revises` has to be of type `{type.__name__}`") uid, revises = create_uid( revises=revises, version_tag=version_tag, n_full_id=type._len_full_uid ) if revises is not None: if description is None: description = getattr(revises, "description", None) if key is None: key = revises.key return uid, version_tag, key, description, revises def _adjust_is_latest_when_deleting_is_versioned( objects: IsVersioned | Iterable[IsVersioned], ) -> list[int]: """After deleting (soft or permanent) versioned records, promote new latest per version family. Accepts a single IsVersioned instance, a QuerySet, or a list of IsVersioned. Runs in 1 query (candidates + update) when objects are passed; no extra query for uids. Returns the list of pks that were promoted to is_latest (for testing). 
""" if isinstance(objects, IsVersioned): objects = [objects] else: objects = list(objects) if not objects: return [] id_list = [o.pk for o in objects] stem_uids = list({o.uid[: o._len_stem_uid] for o in objects if o.is_latest}) if not stem_uids: return [] registry = type(objects[0]) db = getattr(objects[0]._state, "db", None) or "default" len_stem = registry._len_stem_uid # All candidates: same family as any stem_uid, not in trash and not about to be deleted q = Q() for s in stem_uids: q |= Q(uid__startswith=s) qs = registry.objects.using(db).filter(q).exclude(pk__in=id_list) from .sqlrecord import SQLRecord if issubclass(registry, SQLRecord): qs = qs.exclude(branch_id=-1) candidates = list(qs.values("pk", "uid", "created_at")) # per stem_uid, pick candidate with max created_at by_stem: dict[str, dict[str, Any]] = {} for c in candidates: stem = c["uid"][:len_stem] if stem not in by_stem or c["created_at"] > by_stem[stem]["created_at"]: by_stem[stem] = c if not by_stem: return [] pks = [by_stem[s]["pk"] for s in by_stem] registry.objects.using(db).filter(pk__in=pks).update(is_latest=True) if pks: promoted_uids = [by_stem[s]["uid"] for s in by_stem] if len(promoted_uids) == 1: logger.important_hint( f"new latest {registry.__name__} version is: {promoted_uids[0]}" ) else: logger.important_hint( f"new latest {registry.__name__} versions: {promoted_uids}" ) return pks def reconcile_is_latest_within_branch( registry: type[IsVersioned], *, branch_id: int, db: str = "default", ) -> int: """Keep a single is_latest=True per version family in a branch. Winner selection is based on newest created_at, tie-broken by highest pk. Returns the number of records demoted from is_latest=True to False. """ len_stem = registry._len_stem_uid latest_records = list( registry.objects.using(db) .filter(branch_id=branch_id, is_latest=True) .values("pk", "uid", "created_at") .order_by("uid", "created_at", "pk") ) if not latest_records: return 0 winners_by_stem: dict[str, dict[str, Any]] = {} losers: list[int] = [] for record in latest_records: stem = record["uid"][:len_stem] winner = winners_by_stem.get(stem) if winner is None: winners_by_stem[stem] = record continue if (record["created_at"], record["pk"]) > (winner["created_at"], winner["pk"]): losers.append(winner["pk"]) winners_by_stem[stem] = record else: losers.append(record["pk"]) if not losers: return 0 return registry.objects.using(db).filter(pk__in=losers).update(is_latest=False) ================================================ FILE: lamindb/models/_label_manager.py ================================================ from __future__ import annotations from collections import defaultdict from typing import TYPE_CHECKING from django.db import connections from rich.table import Column, Table from rich.text import Text from rich.tree import Tree from lamindb.models import CanCurate, Feature from lamindb.models._from_values import _format_values from lamindb.models.save import save from lamindb.models.sqlrecord import ( REGISTRY_UNIQUE_FIELD, get_name_field, transfer_fk_to_default_db_bulk, transfer_to_default_db, ) from ._describe import ( NAME_WIDTH, TYPE_WIDTH, VALUES_WIDTH, format_rich_tree, ) from ._django import get_artifact_or_run_with_related, get_related_model from ._relations import dict_related_model_to_related_name if TYPE_CHECKING: from lamindb.models import Artifact, Collection, SQLRecord from lamindb.models.query_set import QuerySet EXCLUDE_LABELS = {"schemas"} def _get_labels( obj, links: bool = False, instance: str | None = None ) -> dict[str, 
QuerySet]: """Get all labels associated with an object as a dictionary. This is a generic approach that uses django orm. """ if obj.id is None: return {} labels = {} related_models = dict_related_model_to_related_name( obj.__class__, links=links, instance=instance ) if obj.__class__.__name__ == "Artifact" and links: related_models["ArtifactArtifact"] = "links_artifact" for _, related_name in related_models.items(): if ( related_name not in EXCLUDE_LABELS and not related_name.startswith("_") and not related_name == "json_values" ): labels[related_name] = getattr(obj, related_name).all() return labels def _get_labels_postgres( self: Artifact | Collection, m2m_data: dict | None = None ) -> dict[str, dict[int, str]]: """Get all labels associated with an artifact or collection as a dictionary. This is a postgres-specific approach that uses django Subquery. """ if m2m_data is None: artifact_meta = get_artifact_or_run_with_related(self, include_m2m=True) m2m_data = artifact_meta.get("related_data", {}).get("m2m", {}) return m2m_data def describe_labels( self: Artifact | Collection, related_data: dict | None = None, ) -> Tree | None: """Describe labels.""" labels_data = related_data.get("m2m") if related_data is not None else None if labels_data is None: if ( not self._state.adding and connections[self._state.db].vendor == "postgresql" ): labels_data = _get_labels_postgres(self, labels_data) if not labels_data: labels_data = _get_labels(self, instance=self._state.db) if not labels_data: return None labels_table = Table( Column("", style="", no_wrap=True, width=NAME_WIDTH), Column("", style="dim", no_wrap=True, width=TYPE_WIDTH), Column("", width=VALUES_WIDTH, no_wrap=True), show_header=False, box=None, pad_edge=False, ) for related_name, labels in labels_data.items(): if not labels or related_name == "schemas": continue if isinstance(labels, dict): displays = [ d[key] for d in labels.values() for key in d.keys() if key.endswith("_display") ] print_values = _format_values(displays, n=10, quotes=False) else: # labels are a QuerySet field = get_name_field(labels) print_values = _format_values( labels.values_list(field, flat=True), n=10, quotes=False ) if print_values: related_model = get_related_model(self, related_name) type_str = related_model.__get_name_with_module__() labels_table.add_row( f".{related_name}", Text(type_str, style="dim"), print_values ) tree = None if labels_table.rows: # we might not have rows even if labels_data was non-empty tree = Tree(Text("Labels", style="bold green_yellow"), guide_style="dim") tree.add(labels_table) return tree def _save_validated_records( labels: QuerySet | list | dict, ) -> list[str]: """Save validated records from public based on ontology_id_fields.""" if not labels: return [] registry = labels[0].__class__ field = ( REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), "uid") if not hasattr(registry, "_ontology_id_field") else registry._ontology_id_field ) # if the field value is None, use uid field label_uids = [getattr(label, field) for label in labels if label is not None] # save labels from ontology_ids if hasattr(registry, "_ontology_id_field") and label_uids: try: records = registry.from_values(label_uids, field=field, mute=True) save([r for r in records if r._state.adding]) except Exception: # noqa: S110 pass field = "uid" label_uids = [label.uid for label in labels if label is not None] if issubclass(registry, CanCurate): validated = registry.validate(label_uids, field=field, mute=True) new_labels = [ label for label, is_valid in zip(labels, 
validated) if not is_valid ] return new_labels return list(labels) def save_validated_records( records: QuerySet | list | dict, ) -> list[str] | dict[str, list[str]]: """Save validated records from public based on ontology_id_fields.""" if isinstance(records, dict): return { registry: _save_validated_records(registry_records) for registry, registry_records in records.items() } return _save_validated_records(records) class LabelManager: """Label manager. This allows to manage untyped labels :class:`~lamindb.ULabel` and arbitrary typed labels (e.g., :class:`~bionty.CellLine`) and associate labels with features. """ def __init__(self, sqlrecord: Artifact | Collection) -> None: # host is the sqlrecord that the label manager is attached to # we might rename _host to _sqlrecord in the future self._host = sqlrecord def __repr__(self) -> str: return self.describe(return_str=True) def describe(self, return_str=True) -> str: """Describe the labels.""" tree = describe_labels(self._host) return format_rich_tree(tree, return_str=return_str) def add( self, records: SQLRecord | list[SQLRecord] | QuerySet, feature: Feature | None = None, ) -> None: """Add one or several labels and associate them with a feature. Args: records: Label records to add. feature: Feature under which to group the labels. """ from .artifact import add_labels return add_labels(self._host, records=records, feature=feature) def get( self, feature: Feature, mute: bool = False, flat_names: bool = False, ) -> QuerySet | dict[str, QuerySet] | list: """Get labels given a feature. Args: feature: Feature under which labels are grouped. mute: Show no logging. flat_names: Flatten list to names rather than returning records. """ from .artifact import get_labels return get_labels(self._host, feature=feature, mute=mute, flat_names=flat_names) def add_from(self, data: Artifact | Collection, transfer_logs: dict = None) -> None: """Add labels from an artifact or collection to another artifact or collection. 
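Args:
    data: Source artifact or collection whose labels are transferred.
    transfer_logs: Dictionary collecting transfer logs under the keys
        `"mapped"`, `"transferred"`, and `"run"`; created automatically if `None`.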
Examples: :: artifact1 = ln.Artifact(pd.DataFrame(index=[0, 1])).save() artifact2 = ln.Artifact(pd.DataFrame(index=[2, 3])).save() records = ln.ULabel.from_values(["Label1", "Label2"], field="name").save() labels = ln.ULabel.filter(name__icontains = "label") artifact1.ulabels.set(labels) # using the ManyToMany relationship `.ulabels` artifact2.labels.add_from(artifact1) # using the `.labels` accessor that understands any label type """ if transfer_logs is None: transfer_logs = {"mapped": [], "transferred": [], "run": None} from lamindb import settings using_key = settings._using_key for related_name, labels in _get_labels(data, instance=data._state.db).items(): labels = labels.all() if not labels.exists(): continue # look for features data_name_lower = data.__class__.__name__.lower() labels_by_features: dict = defaultdict(list) features = set() new_labels = save_validated_records(labels) if len(new_labels) > 0: transfer_fk_to_default_db_bulk( new_labels, using_key, transfer_logs=transfer_logs ) for label in labels: keys: list = [] # if the link table doesn't follow this convention, we'll ignore it if not hasattr(label, f"links_{data_name_lower}"): key = None keys.append(key) else: links = getattr(label, f"links_{data_name_lower}").filter( **{f"{data_name_lower}_id": data.id} ) for link in links: if link.feature is not None: features.add(link.feature) key = link.feature.uid else: key = None keys.append(key) label_returned = transfer_to_default_db( label, using_key, transfer_logs=transfer_logs, transfer_fk=False, save=True, ) # TODO: refactor return value of transfer to default db if label_returned is not None: label = label_returned for key in keys: labels_by_features[key].append(label) # treat features new_features = save_validated_records(list(features)) if len(new_features) > 0: transfer_fk_to_default_db_bulk( new_features, using_key, transfer_logs=transfer_logs ) for feature in new_features: transfer_to_default_db( feature, # type: ignore using_key, transfer_logs=transfer_logs, transfer_fk=False, ) save(new_features) # type: ignore if hasattr(self._host, related_name): for feature_uid, feature_labels in labels_by_features.items(): if feature_uid is not None: feature_id = Feature.get(feature_uid).id else: feature_id = None getattr(self._host, related_name).add( *feature_labels, through_defaults={"feature_id": feature_id} ) ================================================ FILE: lamindb/models/_relations.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING import lamindb_setup as ln_setup from django.db.models import ManyToManyField from lamindb_setup._connect_instance import ( get_owner_name_from_identifier, load_instance_settings, ) from lamindb_setup.core._settings_store import instance_settings_file from lamindb.models.sqlrecord import IsLink if TYPE_CHECKING: from lamindb.models.sqlrecord import Registry, SQLRecord def get_schema_modules(instance: str | None) -> set[str]: if instance is None or instance == "default": schema_modules = set(ln_setup.settings.instance.modules) schema_modules.add("core") return schema_modules owner, name = get_owner_name_from_identifier(instance) settings_file = instance_settings_file(name, owner) if settings_file.exists(): modules = set(load_instance_settings(settings_file).modules) else: cache_filepath = ( ln_setup.settings.cache_dir / f"instance--{owner}--{name}--uid.txt" ) if cache_filepath.exists(): modules = set(cache_filepath.read_text().split("\n")[1].split(",")) else: raise 
ValueError(f"Instance {instance} not found") shared_schema_modules = set(ln_setup.settings.instance.modules).intersection( modules ) shared_schema_modules.add("core") return shared_schema_modules # this function here should likely be renamed # it maps the __get_name_with_module__() onto the actual model def dict_module_name_to_model_name( registry: Registry, instance: str | None = None ) -> dict[str, Registry]: schema_modules = get_schema_modules(instance) d: dict = { i.related_model.__get_name_with_module__(): i.related_model for i in registry._meta.related_objects if i.related_name is not None and i.related_model.__get_module_name__() in schema_modules } d.update( { i.related_model.__get_name_with_module__(): i.related_model for i in registry._meta.many_to_many if i.name is not None and i.related_model.__get_module_name__() in schema_modules } ) return d def dict_related_model_to_related_name( registry: type[SQLRecord], links: bool = False, instance: str | None = None ) -> dict[str, str]: def include(model: SQLRecord): return not links != issubclass(model, IsLink) schema_modules = get_schema_modules(instance) related_objects = registry._meta.related_objects + registry._meta.many_to_many d: dict = { record.related_model.__get_name_with_module__(): ( record.related_name if not isinstance(record, ManyToManyField) else record.name ) for record in related_objects if ( record.name is not None and include(record.related_model) and record.related_model.__get_module_name__() in schema_modules and not ( ( record.related_name if not isinstance(record, ManyToManyField) else record.name ).startswith("linked_in_") ) ) } if "RecordRecord" in d: d["RecordRecord"] = "values_record" return d def get_related_name(features_type: type[SQLRecord]) -> str: from lamindb.models.schema import Schema candidates = [ field.related_name for field in Schema._meta.related_objects if field.related_model == features_type ] if not candidates: raise ValueError( f"Can't create feature sets from {features_type.__name__} because it's not" " related to it!\nYou need to create a link model between Schema and" " your SQLRecord in your custom module.\nTo do so, add a" " line:\n_schemas = models.ManyToMany(Schema," " related_name='mythings')\n" ) return candidates[0] ================================================ FILE: lamindb/models/_run_cleanup.py ================================================ """Background cleanup of report/environment artifacts after Run bulk delete. 
Runnable as: python -m lamindb.models._run_cleanup --instance owner/name --ids 1,2,3 [--run-uid UID] """ import argparse import logging from lamin_utils import logger import lamindb as ln def main() -> None: parser = argparse.ArgumentParser(description="Clean up orphaned run artifacts.") parser.add_argument("--instance", required=True, help="Instance slug (owner/name).") parser.add_argument("--ids", required=True, help="Comma-separated artifact IDs.") parser.add_argument( "--run-uid", required=True, help="Run UID for log file name (run_cleanup_logs_{uid}.txt in cache dir).", ) args = parser.parse_args() ln.connect(args.instance) file_handler = None log_path = ln.setup.settings.cache_dir / f"run_cleanup_logs_{args.run_uid}.txt" file_handler = logging.FileHandler(log_path, mode="a") logger.addHandler(file_handler) for aid_str in args.ids.split(","): aid = int(aid_str.strip()) artifact = ln.Artifact.objects.filter(id=aid).first() if artifact is not None: assert artifact.kind == "__lamindb_run__", ( f"artifact {artifact.uid} is not of __lamindb_run__ kind, aborting cleanup of artifacts {args.ids}" ) try: artifact.delete(permanent=True) logger.important(f"deleted artifact {aid}") except Exception as e: logger.error(f"did not delete artifact {aid}: {e}") pass if __name__ == "__main__": main() ================================================ FILE: lamindb/models/artifact.py ================================================ # ruff: noqa: TC004 from __future__ import annotations import shutil import types import warnings from collections import defaultdict from pathlib import Path, PurePath, PurePosixPath from typing import TYPE_CHECKING, Any, Iterator, Literal, TypeVar, Union, overload import fsspec import lamindb_setup as ln_setup from django.db import ProgrammingError, models from django.db.models import CASCADE, PROTECT, Q from django.db.models.functions import Length from lamin_utils import colors, logger from lamindb_setup import settings as setup_settings from lamindb_setup.core._hub_core import select_storage_or_parent from lamindb_setup.core.hashing import HASH_LENGTH, hash_dir, hash_file from lamindb_setup.core.upath import ( LocalPathClasses, UPath, create_path, extract_suffix_from_path, fs_for_moving, get_stat_dir_cloud, get_stat_file_cloud, ) from ..base.fields import ( BigIntegerField, BooleanField, CharField, ForeignKey, TextField, ) from ..base.users import current_user_id from ..base.utils import deprecated, strict_classmethod from ..core._compat import with_package_obj from ..core._settings import settings from ..errors import ( FieldValidationError, InvalidArgument, NoStorageLocationForSpace, NoWriteAccess, UnknownStorageLocation, ValidationError, ) from ._feature_manager import ( FeatureManager, get_label_links, ) from ._is_versioned import ( IsVersioned, create_uid, ) from ._relations import ( dict_module_name_to_model_name, dict_related_model_to_related_name, ) from .feature import Feature, JsonValue from .has_parents import view_lineage from .query_set import QuerySet, SQLRecordList from .run import Run, TracksRun, TracksUpdates, User from .save import check_and_attempt_clearing, check_and_attempt_upload from .schema import Schema from .sqlrecord import ( BaseSQLRecord, Branch, IsLink, Space, SQLRecord, _get_record_kwargs, ) from .storage import Storage from .ulabel import ULabel def _lazy_load_storage_module(): """Lazy-import storage to avoid loading pandas/anndata at package import.""" from ..core.storage import ( delete_storage, infer_suffix, write_to_disk, ) from 
..core.storage.paths import ( AUTO_KEY_PREFIX, auto_storage_key_from_artifact, auto_storage_key_from_artifact_uid, check_path_is_child_of_root, filepath_cache_key_from_artifact, filepath_from_artifact, ) return types.SimpleNamespace( delete_storage=delete_storage, infer_suffix=infer_suffix, write_to_disk=write_to_disk, AUTO_KEY_PREFIX=AUTO_KEY_PREFIX, auto_storage_key_from_artifact=auto_storage_key_from_artifact, auto_storage_key_from_artifact_uid=auto_storage_key_from_artifact_uid, check_path_is_child_of_root=check_path_is_child_of_root, filepath_cache_key_from_artifact=filepath_cache_key_from_artifact, filepath_from_artifact=filepath_from_artifact, ) # Cache the storage utils on first use _storage_cache: object | None = None # refactor this module to group logic that needs storage access in a class # in the future; then we don't need _s() anymore def _s(): global _storage_cache if _storage_cache is None: _storage_cache = _lazy_load_storage_module() return _storage_cache WARNING_RUN_TRANSFORM = "no run & transform got linked, call `ln.track()` & re-run" WARNING_NO_INPUT = "run input wasn't tracked, call `ln.track()` and re-run" def _identify_zarr_type(storepath, *, check: bool = True): """Lazy-import to avoid loading storage at package import.""" try: from ..core.storage._zarr import identify_zarr_type return identify_zarr_type(storepath, check=check) except ImportError: raise ImportError("Please install zarr: pip install 'lamindb[zarr]'") from None if TYPE_CHECKING: from collections.abc import Iterable import pandas as pd from anndata import AnnData from fsspec import AbstractFileSystem from lamindb_setup.types import AnyPathStr from mudata import MuData # noqa: TC004 from polars import LazyFrame as PolarsLazyFrame from pyarrow.dataset import Dataset as PyArrowDataset from spatialdata import SpatialData # noqa: TC004 from tiledbsoma import Collection as SOMACollection from tiledbsoma import Experiment as SOMAExperiment from tiledbsoma import Measurement as SOMAMeasurement from ..base.types import ( ArtifactKind, StrField, ) from ..core.storage._backed_access import ( AnnDataAccessor, BackedAccessor, SpatialDataAccessor, ) from ..core.storage.types import ScverseDataStructures from ._label_manager import LabelManager from .block import ArtifactBlock from .collection import Collection from .project import Project, Reference from .query_manager import RelatedManager from .record import Record from .transform import Transform OUTDATED_ARTIFACT_FILES_OVERWRITTEN_MSG = ( "Cannot read this outdated artifact version: " "its files were overwritten and are no longer available.\n" "Read from the latest version: artifact.versions.get(is_latest=True)" ) def process_pathlike( filepath: UPath, storage: Storage, using_key: str | None, skip_existence_check: bool = False, ) -> tuple[Storage, bool]: """Determines the appropriate storage for a given path and whether to use an existing storage key.""" if not skip_existence_check: try: # check if file exists if not filepath.exists(): raise FileNotFoundError(filepath) except PermissionError: pass if _s().check_path_is_child_of_root(filepath, storage.root): use_existing_storage_key = True return storage, use_existing_storage_key else: # check whether the path is part of one of the existing # already-registered storage locations result = None # within the hub, we don't want to perform check_path_in_existing_storage if using_key is None: result = check_path_in_existing_storage( filepath, check_hub_register_storage=setup_settings.instance.is_on_hub ) if 
isinstance(result, Storage): use_existing_storage_key = True return result, use_existing_storage_key else: # if the path is in the cloud, we have a good candidate # for the storage root: the bucket if not isinstance(filepath, LocalPathClasses): # for a cloud path, new_root is always the bucket name if filepath.protocol == "hf": hf_path = filepath.fs.resolve_path(filepath.as_posix()) if hasattr(hf_path, "root"): new_root = "hf://" + hf_path.root else: hf_path.path_in_repo = "" new_root = "hf://" + hf_path.unresolve().rstrip("/") else: if filepath.protocol == "s3": # check that endpoint_url didn't propagate here # as a part of the path string assert "?" not in filepath.path # noqa: S101 new_root = list(filepath.parents)[-1].as_posix().rstrip("/") # Re the Parallel execution of the logic below: # One of the threads (or processes) would start to write the hub record and then the test file. # The other ones would retrieve the hub record and the test file. # All of them would come out of the exercise with storage_record.instance_uid == setup_settings.instance.uid # and all of them would raise UnkownStorageLocation. # Then one of these threads will trigger storage_record.delete() but also this is idempotent; # this means they all throw the same error and deletion of the inexistent stuff (hub record, marker file) # would just silently fail. # Edge case: A user legitimately creates a storage location and another user runs this here at the exact same time. # There is no way to decide then which is the legitimate creation. storage_record = Storage(root=new_root).save() if storage_record.instance_uid == setup_settings.instance.uid: # we don't want to inadvertently create managed storage locations # hence, we revert the creation and throw an error storage_record.delete() raise UnknownStorageLocation( f"Path {filepath} is not contained in any known storage location:\n{Storage.to_dataframe()[['uid', 'root', 'type']]}\n\n" f"Create a managed storage location that contains the path, e.g., by calling: ln.Storage(root='{new_root}').save()" ) use_existing_storage_key = True return storage_record, use_existing_storage_key # if the filepath is local else: use_existing_storage_key = False # if the default storage is local we'll throw an error if the user # doesn't provide a key if storage.type == "local": return storage, use_existing_storage_key # if the default storage is in the cloud (the file is going to # be uploaded upon saving it), we treat the filepath as a cache else: return storage, use_existing_storage_key def process_data( provisional_uid: str, data: AnyPathStr | pd.DataFrame | AnnData, format: str | None, key: str | None, storage: Storage, using_key: str | None, skip_existence_check: bool = False, is_replace: bool = False, to_disk_kwargs: dict[str, Any] | None = None, ) -> tuple[Any, Path | UPath, str, Storage, bool]: """Serialize a data object that's provided as file or in memory. 
if not overwritten, data gets stored in default storage """ if with_package_obj(data, "AnnData", "anndata", lambda obj: True)[0]: is_anndata = True is_pathlike = False elif isinstance(data, (str, Path, UPath)): is_anndata = False is_pathlike = True else: is_anndata = False is_pathlike = False if key is not None: key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key") # use suffix as the (adata) format if the format is not provided if is_anndata and format is None and len(key_suffix) > 0: format = key_suffix[1:] else: key_suffix = None if is_pathlike: access_token = ( storage._access_token if hasattr(storage, "_access_token") else None ) path = create_path(data, access_token=access_token) # we don't resolve http links because they can resolve into a different domain # for example into a temporary url if path.protocol not in {"http", "https"}: path = path.resolve() storage, use_existing_storage_key = process_pathlike( path, storage=storage, using_key=using_key, skip_existence_check=skip_existence_check, ) suffix = extract_suffix_from_path(path) memory_rep = None elif ( is_anndata or data_is_dataframe(data) or data_is_scversedatastructure(data, "MuData") or data_is_scversedatastructure(data, "SpatialData") ): storage = storage memory_rep = data suffix = _s().infer_suffix(data, format) else: raise NotImplementedError( f"Do not know how to create an Artifact from {data}, pass a path instead." ) # Check for suffix consistency if key_suffix is not None and key_suffix != suffix and not is_replace: # consciously omitting a trailing period if is_pathlike: message = f"The passed path's suffix '{suffix}' must match the passed key's suffix '{key_suffix}'." else: message = f"The passed key's suffix '{key_suffix}' must match the passed path's suffix '{suffix}'." raise InvalidArgument(message) # in case we have an in-memory representation, we need to write it to disk if memory_rep is not None: path = settings.cache_dir / f"{provisional_uid}{suffix}" logger.info("writing the in-memory object into cache") if to_disk_kwargs is None: to_disk_kwargs = {} _s().write_to_disk(data, path, **to_disk_kwargs) use_existing_storage_key = False return memory_rep, path, suffix, storage, use_existing_storage_key def get_stat_or_artifact( path: UPath, storage: Record, key: str | None = None, check_hash: bool = True, is_replace: bool = False, instance: str | None = None, skip_hash_lookup: bool = False, ) -> Union[tuple[int, str | None, str | None, int | None, Artifact | None], Artifact]: """Retrieves file statistics or an existing artifact based on the path, hash, and key.""" n_files = None if settings.creation.artifact_skip_size_hash: return None, None, None, n_files, None stat = path.stat() # one network request if not isinstance(path, LocalPathClasses): size, hash, hash_type = None, None, None if stat is not None: # convert UPathStatResult to fsspec info dict stat = stat.as_info() if (store_type := stat["type"]) == "file": size, hash, hash_type = get_stat_file_cloud(stat) elif store_type == "directory": size, hash, hash_type, n_files = get_stat_dir_cloud(path) if hash is None: logger.warning(f"did not add hash for {path}") return size, hash, hash_type, n_files, None else: if path.is_dir(): size, hash, hash_type, n_files = hash_dir(path) else: size, hash, hash_type = hash_file(path) if not check_hash: return size, hash, hash_type, n_files, None # Empty files all share the same content hash; skip cross-artifact hash # lookup so creating a new empty file path yields a new artifact. 
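# note: `n_files is None` restricts this to single files; directories (which set
# `n_files`) keep the regular hash lookup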
if n_files is None and size == 0: skip_hash_lookup = True previous_artifact_version = None artifacts_qs = Artifact.objects.using(instance) if skip_hash_lookup: artifact_with_same_hash_exists = False if key is not None and not is_replace: # only search for a previous version of the artifact # ignoring hash queryset_same_hash_or_same_key = artifacts_qs.filter( ~Q(branch_id=-1), key=key, storage=storage, ).order_by("-created_at") else: queryset_same_hash_or_same_key = [] else: # this purposefully leaves out the storage location and key that we have # in the hard database unique constraints # so that the user is able to find artifacts with the same hash across # storage locations and keys # if this is not desired, set skip_hash_lookup=True if key is None or is_replace: queryset_same_hash = artifacts_qs.filter(~Q(branch_id=-1), hash=hash) artifact_with_same_hash_exists = queryset_same_hash.count() > 0 else: # the following query achieves one more thing beyond hash lookup # it allows us to find a previous version of the artifact based on # matching key & storage even if the hash is different # we do this here so that we don't have to do an additional query later # see the `previous_artifact_version` variable below queryset_same_hash_or_same_key = artifacts_qs.filter( ~Q(branch_id=-1), Q(hash=hash) | Q(key=key, storage=storage), ).order_by("-created_at") queryset_same_hash = queryset_same_hash_or_same_key.filter(hash=hash) artifact_with_same_hash_exists = queryset_same_hash.count() > 0 if key is not None and not is_replace: if ( not artifact_with_same_hash_exists and queryset_same_hash_or_same_key.count() > 0 ): logger.important( f"creating new artifact version for key '{key}' in storage '{storage.root}'" ) previous_artifact_version = queryset_same_hash_or_same_key[0] if artifact_with_same_hash_exists: artifact_with_same_hash = queryset_same_hash[0] logger.important( f"returning artifact with same hash: {artifact_with_same_hash}; to track this artifact as an input, use: ln.Artifact.get()" ) return artifact_with_same_hash else: return size, hash, hash_type, n_files, previous_artifact_version def check_path_in_existing_storage( path: Path | UPath, check_hub_register_storage: bool = False, using_key: str | None = None, ) -> Storage | None: for storage in Storage.objects.using(using_key).order_by(Length("root").desc()): # if path is part of storage, return it if _s().check_path_is_child_of_root(path, root=storage.root): return storage # we don't see parents registered in the db, so checking the hub # just check for 2 writable cloud protocols, maybe change in the future if check_hub_register_storage and getattr(path, "protocol", None) in {"s3", "gs"}: result = select_storage_or_parent(path.as_posix()) if result is not None: return Storage(**result, _skip_preparation=True).save() return None def get_relative_path_to_directory( path: PurePath | Path | UPath, directory: PurePath | Path | UPath ) -> PurePath | Path | UPath: if isinstance(directory, UPath) and not isinstance(directory, LocalPathClasses): # this is safer for cloud paths such as http paths relpath = PurePath( path.as_posix().replace(directory.as_posix(), "").lstrip("/") ) elif isinstance(directory, LocalPathClasses): relpath = path.resolve().relative_to(directory.resolve()) # type: ignore elif isinstance(directory, PurePath): relpath = path.relative_to(directory) else: raise TypeError("Directory not of type Path or UPath") return relpath def get_artifact_kwargs_from_data( *, data: Path | UPath | str | pd.DataFrame | ScverseDataStructures, 
key: str | None, run: Run | None, format: str | None, provisional_uid: str, version_tag: str | None, storage: Storage, using_key: str | None = None, is_replace: bool = False, skip_check_exists: bool = False, overwrite_versions: bool | None = None, skip_hash_lookup: bool = False, to_disk_kwargs: dict[str, Any] | None = None, key_is_virtual: bool | None = None, ): memory_rep, path, suffix, storage, use_existing_storage_key = process_data( provisional_uid, data, format, key, storage, using_key, skip_check_exists, is_replace=is_replace, to_disk_kwargs=to_disk_kwargs, ) check_path_in_storage = False real_key = None if use_existing_storage_key: inferred_key = get_relative_path_to_directory( path=path, directory=UPath(storage.root) ).as_posix() if key is None: key = inferred_key elif key != inferred_key: real_key = inferred_key check_path_in_storage = True else: storage = storage stat_or_artifact = get_stat_or_artifact( path=path, storage=storage, key=key, instance=using_key, is_replace=is_replace, skip_hash_lookup=skip_hash_lookup, ) if not isinstance(path, LocalPathClasses): local_filepath = None cloud_filepath = path else: local_filepath = path cloud_filepath = None privates = { "local_filepath": local_filepath, "cloud_filepath": cloud_filepath, "memory_rep": memory_rep, "check_path_in_storage": check_path_in_storage, } if isinstance(stat_or_artifact, Artifact): existing_artifact = stat_or_artifact # if the artifact was unsuccessfully saved, we want to # enable re-uploading after returning the artifact object # the upload is triggered by whether the privates are returned if existing_artifact._storage_ongoing: privates["key"] = key returned_privates = privates # re-upload necessary else: returned_privates = {"key": key} returned_privates["is_artifact_storage_managed_by_current_instance"] = ( existing_artifact.storage.instance_uid == setup_settings.instance.uid ) return existing_artifact, returned_privates else: size, hash, hash_type, n_files, revises = stat_or_artifact # update local path if revises is not None: # update provisional_uid provisional_uid, revises = create_uid(revises=revises, version_tag=version_tag) if settings.cache_dir in path.parents: path = path.rename(path.with_name(f"{provisional_uid}{suffix}")) privates["local_filepath"] = path log_storage_hint( check_path_in_storage=check_path_in_storage, storage=storage, key=key, uid=provisional_uid, suffix=suffix, is_dir=n_files is not None, ) if overwrite_versions is None: overwrite_versions = n_files is not None if check_path_in_storage: # True here means that we have a path in an existing storage with a virtual key real_key_is_set = real_key is not None if key_is_virtual is not None and key_is_virtual != real_key_is_set: raise ValueError( f"Passing a path in an existing storage {'with' if real_key_is_set else 'without'} " f"a virtual key and _key_is_virtual={key_is_virtual} is incompatible." ) # we use an actual storage key if key is not provided explicitly set_key_is_virtual = real_key_is_set else: # do we use a virtual or an actual storage key? 
set_key_is_virtual = ( settings.creation._artifact_use_virtual_keys if key_is_virtual is None else key_is_virtual ) # needed to check if the artifact storage is managed by the current instance on artifact init privates["is_artifact_storage_managed_by_current_instance"] = ( storage.instance_uid == setup_settings.instance.uid ) kwargs = { "uid": provisional_uid, "suffix": suffix, "hash": hash, "_hash_type": hash_type, "key": key, "size": size, "storage_id": storage.id, "n_files": n_files, "_overwrite_versions": overwrite_versions, # True for folder, False for file "n_observations": None, # to implement "run_id": run.id if run is not None else None, "run": run, "_key_is_virtual": set_key_is_virtual, "revises": revises, "_real_key": real_key, } return kwargs, privates def log_storage_hint( *, check_path_in_storage: bool, storage: Storage | None, key: str | None, uid: str, suffix: str, is_dir: bool, ) -> None: hint = "" if check_path_in_storage: display_root = storage.root # type: ignore # check whether path is local if fsspec.utils.get_protocol(storage.root) == "file": # type: ignore # if it's a local path, check whether it's in the current working directory root_path = Path(storage.root) # type: ignore if _s().check_path_is_child_of_root(root_path, Path.cwd()): # only display the relative path, not the fully resolved path display_root = root_path.relative_to(Path.cwd()) # type: ignore hint += f"path in storage '{display_root}'" # type: ignore else: hint += "path content will be copied to default storage upon `save()`" if key is None: storage_key = _s().auto_storage_key_from_artifact_uid(uid, suffix, is_dir) hint += f" with key `None` ('{storage_key}')" else: hint += f" with key '{key}'" logger.hint(hint) def data_is_dataframe(data: Any) -> bool: # TODO: maybe check also for pandas.DataFrame subclasses, # but in this case also infer_suffix should be updated return with_package_obj(data, "DataFrame", "pandas", lambda obj: True)[0] def data_is_scversedatastructure( data: ScverseDataStructures | AnyPathStr, structure_type: Literal["AnnData", "MuData", "SpatialData"] | None = None, cloud_warning: bool = True, ) -> bool: """Determine whether a specific in-memory object or a path is any or a specific scverse data structure.""" file_suffix = None if structure_type == "AnnData": file_suffix = ".h5ad" elif structure_type == "MuData": file_suffix = ".h5mu" # SpatialData does not have a unique suffix but `.zarr` # AnnData allows both AnnDataAccessor and AnnData class_name = data.__class__.__name__ if structure_type is None: return any( class_name in (["AnnData", "AnnDataAccessor"] if cl_name == "AnnData" else [cl_name]) for cl_name in ["AnnData", "MuData", "SpatialData"] ) elif class_name in ( ["AnnData", "AnnDataAccessor"] if structure_type == "AnnData" else [structure_type] ): return True data_type = structure_type.lower() if isinstance(data, (str, Path, UPath)): data_path = UPath(data) if file_suffix in data_path.suffixes: return True if data_path.suffix == ".zarr": type_suffix = f".{data_type}" if type_suffix in data_path.suffixes: return True # check only for local, expensive for cloud if fsspec.utils.get_protocol(data_path.as_posix()) == "file": return ( _identify_zarr_type( data_path if structure_type == "AnnData" else data, check=True if structure_type == "AnnData" else False, ) == data_type ) elif cloud_warning: logger.warning( f"we do not check whether cloud zarr is {structure_type}" ) return False return False def data_is_soma_experiment(data: SOMAExperiment | AnyPathStr) -> bool: # We are not 
importing tiledbsoma here to keep loaded modules minimal if hasattr(data, "__class__") and data.__class__.__name__ == "Experiment": return True if isinstance(data, (str, Path, UPath)): return UPath(data).suffix == ".tiledbsoma" return False def check_otype_artifact( data: AnyPathStr | pd.DataFrame | ScverseDataStructures, otype: str | None = None, cloud_warning: bool = True, ) -> str: if otype is not None: return otype if isinstance(data, (str, Path, UPath)): is_pathlike = True suffix = UPath(data).suffix else: is_pathlike = False suffix = None if (is_pathlike and suffix in {".parquet", ".csv", ".ipc"}) or data_is_dataframe( data ): logger.warning("data is a DataFrame, please use .from_dataframe()") otype = "DataFrame" return otype if data_is_scversedatastructure(data, "AnnData", cloud_warning): if not is_pathlike: logger.warning("data is an AnnData, please use .from_anndata()") otype = "AnnData" elif data_is_scversedatastructure(data, "MuData", cloud_warning): if not is_pathlike: logger.warning("data is a MuData, please use .from_mudata()") otype = "MuData" elif data_is_scversedatastructure(data, "SpatialData", cloud_warning): if not is_pathlike: logger.warning("data is a SpatialData, please use .from_spatialdata()") otype = "SpatialData" elif not is_pathlike: raise TypeError("data has to be a string, Path, UPath") return otype def populate_subsequent_run(record: Artifact | Collection, run: Run | None) -> None: if run is None: return if record.run is None: record.run = run elif record.run != run: record.recreating_runs.add(run) record._subsequent_run_id = run.id # also see current_run() in core._data def get_run(run: Run | None) -> Run | None: from ..core._context import context from ..core._functions import get_current_tracked_run if run is None: run = get_current_tracked_run() if run is None: run = context.run if run is None and not settings.creation.artifact_silence_missing_run_warning: isettings = setup_settings.instance if not (isettings._is_clone or isettings.is_read_only_connection): logger.warning(WARNING_RUN_TRANSFORM) # suppress run by passing False elif not run: run = None return run def save_staged_schemas(self: Artifact) -> None: if hasattr(self, "_staged_schemas"): from lamindb.models._feature_manager import get_schema_by_slot_ existing_staged_schemas = get_schema_by_slot_(self) saved_staged_schemas = {} for key, schema in self._staged_schemas.items(): if isinstance(schema, Schema) and schema._state.adding: schema.save() saved_staged_schemas[key] = schema if key in existing_staged_schemas: # remove existing feature set on the same slot self.schemas.remove(existing_staged_schemas[key]) if len(saved_staged_schemas) > 0: s = "s" if len(saved_staged_schemas) > 1 else "" display_schema_keys = ",".join( f"'{key}'" for key in saved_staged_schemas.keys() ) logger.save( f"saved {len(saved_staged_schemas)} feature set{s} for slot{s}:" f" {display_schema_keys}" ) def save_schema_links(self: Artifact) -> None: from lamindb.models.save import bulk_create if hasattr(self, "_staged_schemas"): links = [] for slot, schema in self._staged_schemas.items(): kwargs = { "artifact_id": self.id, "schema_id": schema.id, "slot": slot, } links.append(Artifact.schemas.through(**kwargs)) bulk_create(links, ignore_conflicts=True) def validate_feature(feature: Feature, records: list[SQLRecord]) -> None: """Validate feature record, adjust feature.dtype based on labels records.""" if not isinstance(feature, Feature): raise TypeError("feature has to be of type Feature") if feature._state.adding: registries = 
{record.__class__.__get_name_with_module__() for record in records} registries_str = "|".join(registries) msg = f"ln.Feature(name='{feature.name}', type='cat[{registries_str}]').save()" raise ValidationError(f"Feature not validated. If it looks correct: {msg}") def get_labels( self, feature: Feature, mute: bool = False, flat_names: bool = False, ) -> QuerySet | dict[str, QuerySet] | list: """{}""" # noqa: D415 from .record import Record if not isinstance(feature, Feature): raise TypeError("feature has to be of type Feature") dtype_str = feature._dtype_str if dtype_str is None or not dtype_str.startswith("cat["): raise ValueError("feature does not have linked labels") registries_to_check = dtype_str.replace("cat[", "").rstrip("]").split("|") if len(registries_to_check) > 1 and not mute: logger.warning("labels come from multiple registries!") # return an empty query set if self.id is still None if self.id is None: return QuerySet(self.__class__) qs_by_registry = {} for registry in registries_to_check: # currently need to distinguish between ULabel and non-ULabel, because # we only have the feature information for Label if registry in {"ULabel", "Record"}: links_to_labels = get_label_links(self, registry, feature) label_ids = [ (link.ulabel_id if registry == "ULabel" else link.record_id) for link in links_to_labels ] model = ULabel if registry == "ULabel" else Record qs_by_registry[registry] = model.objects.using(self._state.db).filter( id__in=label_ids ) elif registry in self.features._accessor_by_registry: qs_by_registry[registry] = getattr( self, self.features._accessor_by_registry[registry] ).all() if flat_names: # returns a flat list of names from .sqlrecord import get_name_field values = [] for v in qs_by_registry.values(): values += v.to_list(get_name_field(v)) return values if len(registries_to_check) == 1 and registry in qs_by_registry: return qs_by_registry[registry] else: return qs_by_registry def add_labels( self, records: SQLRecord | list[SQLRecord] | QuerySet | Iterable, feature: Feature | None = None, *, field: StrField | None = None, from_curator: bool = False, ) -> None: """{}""" # noqa: D415 if self._state.adding: raise ValueError("Please save the artifact/collection before adding a label!") if isinstance(records, (QuerySet, QuerySet.__base__)): # need to have both records = records.to_list() if isinstance(records, (str, SQLRecord)): records = [records] if not isinstance(records, list): # avoids warning for pd Series records = list(records) # create records from values if len(records) == 0: return None if isinstance(records[0], str): # type: ignore records_validated = [] # feature is needed if we want to create records from values if feature is None: raise ValueError( "Please pass a feature, e.g., via: label = ln.ULabel(name='my_label'," " feature=ln.Feature(name='my_feature'))" ) dtype_str = feature._dtype_str if dtype_str.startswith("cat["): orm_dict = dict_module_name_to_model_name(Artifact) for reg in dtype_str.replace("cat[", "").rstrip("]").split("|"): registry = orm_dict.get(reg) records_validated += registry.from_values(records, field=field) # feature doesn't have registries and therefore can't create records from values # ask users to pass records if len(records_validated) == 0: raise ValueError( "Please pass a record (a `SQLRecord` object), not a string, e.g., via:" " label" f" = ln.Record(name='{records[0]}')" # type: ignore ) records = records_validated for record in records: if record._state.adding: raise ValidationError( f"{record} not validated. 
If it looks correct: record.save()" ) if feature is None: d = dict_related_model_to_related_name(self.__class__) # strategy: group records by registry to reduce number of transactions records_by_related_name: dict = {} for record in records: related_name = d.get(record.__class__.__get_name_with_module__()) if related_name is None: raise ValueError(f"Can't add labels to {record.__class__} record!") if related_name not in records_by_related_name: records_by_related_name[related_name] = [] records_by_related_name[related_name].append(record) for related_name, records in records_by_related_name.items(): getattr(self, related_name).add(*records) else: validate_feature(feature, records) # type:ignore records_by_registry = defaultdict(list) schemas = self.schemas.filter(itype="Feature") internal_features = set() # type: ignore if len(schemas) > 0: for schema in schemas: internal_features = internal_features.union( set(schema.members.values_list("name", flat=True)) ) # type: ignore for record in records: records_by_registry[record.__class__.__get_name_with_module__()].append( record ) for registry_name, records in records_by_registry.items(): if not from_curator and feature.name in internal_features: raise ValidationError( "Cannot manually annotate a feature measured *within* the dataset. Please use a Curator." ) dtype_str = feature._dtype_str if registry_name not in dtype_str: if not dtype_str.startswith("cat"): raise ValidationError( f"Feature {feature.name} needs dtype='cat' for label annotation, currently has dtype='{dtype_str}'" ) if registry_name not in dtype_str: new_dtype = dtype_str.rstrip("]") + f"|{registry_name}]" raise ValidationError( f"Label type {registry_name} is not valid for Feature(name='{feature.name}', dtype='{dtype_str}'), consider a feature with dtype='{new_dtype}'" ) if registry_name not in self.features._accessor_by_registry: logger.warning(f"skipping {registry_name}") continue if len(records) == 0: continue features_labels = { registry_name: [(feature, label_record) for label_record in records] } self.features._add_label_feature_links( features_labels, ) def delete_permanently(artifact: Artifact, storage: bool | None, using_key: str): # need to grab file path before deletion try: path, _ = _s().filepath_from_artifact(artifact, using_key) except OSError: # we can still delete the record logger.warning("Could not get path") storage = False # only delete in storage if DB delete is successful # DB delete might error because of a foreign key constraint violated etc. 
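# Illustration only (assuming permanent deletion is reached via the public
# `Artifact.delete` API; the key below is hypothetical):
#
#     artifact = ln.Artifact.get(key="examples/my_file.parquet")
#     artifact.delete(permanent=True)  # asks whether to also delete the data in storage
#     artifact.delete(permanent=True, storage=False)  # keep the file/folder in storage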
if artifact._overwrite_versions and artifact.is_latest: logger.important( "deleting all versions of this artifact because they all share the same store" ) # artifact.versions pulls only versions that are not in trash # this query set below contains all versions including those that are in trash versions = Artifact.objects.using(artifact._state.db).filter( uid__startswith=artifact.stem_uid ) for version in versions: _delete_skip_storage(version) else: artifact._delete_skip_storage() # by default do not delete storage if deleting only a previous version # and the underlying store is mutable if artifact._overwrite_versions and not artifact.is_latest: delete_in_storage = False if storage: logger.warning( "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True" ) elif artifact.key is None or ( artifact._key_is_virtual and artifact._real_key is None ): # do not ask for confirmation also if storage is None delete_in_storage = storage is None or storage else: # for artifacts with non-virtual semantic storage keys (key is not None) # ask for extra-confirmation if storage is None # the wording here is critical to avoid accidental deletions if storage is None: response = input( f"Artifact record deleted. Do you ALSO want to delete the data in storage at {path}? (y/n) You can't undo" " this action." ) delete_in_storage = response == "y" else: delete_in_storage = storage if not delete_in_storage: logger.important(f"a file/folder remains here: {path}") # we don't yet have logic to bring back the deleted metadata record # in case storage deletion fails - this is important for ACID down the road if delete_in_storage: delete_msg = _s().delete_storage(path, raise_file_not_found_error=False) if delete_msg != "did-not-delete": logger.success(f"deleted {colors.yellow(f'{path}')}") class LazyArtifact: """Lazy artifact for streaming to auto-generated internal paths. This is needed when it is desirable to stream to a `lamindb` auto-generated internal path and register the path as an artifact (see :class:`~lamindb.Artifact`). This object creates a real artifact on `.save()` with the provided arguments. Args: suffix: The suffix for the auto-generated internal path overwrite_versions: Whether to overwrite versions. **kwargs: Keyword arguments for the artifact to be created. Examples: Create a lazy artifact, write to the path and save to get a real artifact:: lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr") zarr.open(lazy.path, mode="w")["test"] = np.array(["test"]) # stream to the path artifact = lazy.save() """ def __init__(self, suffix: str, overwrite_versions: bool, **kwargs): self.kwargs = kwargs self.kwargs["overwrite_versions"] = overwrite_versions if (key := kwargs.get("key")) is not None and extract_suffix_from_path( PurePosixPath(key) ) != suffix: raise ValueError( "The suffix argument and the suffix of key should be the same." 
) uid, _ = create_uid(n_full_id=20) storage_key = _s().auto_storage_key_from_artifact_uid( uid, suffix, overwrite_versions=overwrite_versions ) storepath = setup_settings.storage.root / storage_key self._path = storepath @property def path(self) -> UPath: return self._path def save(self, upload: bool | None = None, **kwargs) -> Artifact: artifact = Artifact(self.path, _is_internal_call=True, **self.kwargs) return artifact.save(upload=upload, **kwargs) def __repr__(self) -> str: # pragma: no cover show_kwargs = {k: v for k, v in self.kwargs.items() if v is not None} return ( f"LazyArtifact object with\n path: {self.path}\n arguments: {show_kwargs}" ) T = TypeVar("T", bound=BaseSQLRecord) def _sqlrecord_or_id( model: type[T], sqlrecord: T | None, sqlrecord_id: int | None, check_type: bool = True, ) -> T | None: if sqlrecord is not None and sqlrecord_id is not None: raise ValueError( f"Do not pass both {model.__name__} and its id at the same time." ) if sqlrecord is None and sqlrecord_id is None: return None elif sqlrecord is not None: assert not check_type or isinstance(sqlrecord, model), ( f"Expected {model.__name__}, got {type(sqlrecord).__name__}." ) return sqlrecord elif sqlrecord_id is not None: return model.objects.get(id=sqlrecord_id) class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates): """Datasets & models stored as files, folders, or arrays. Some artifacts are table- or array-like, e.g., when stored as `.parquet`, `.h5ad`, `.zarr`, or `.tiledb`. Args: path: `AnyPathStr` A path to a local or remote folder or file from which to create the artifact. key: `str | None = None` A key within the storage location, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family. description: `str | None = None` A description. kind: `Literal["dataset", "model"] | str | None = None` Distinguish models from datasets from other files & folders. features: `dict | None = None` External features to annotate via :class:`~lamindb.models.FeatureManager.set_values`. schema: `Schema | None = None` A schema to validate features. revises: `Artifact | None = None` Previous version of the artifact. An alternative to passing `key` when creating a new version. overwrite_versions: `bool | None = None` Whether to overwrite versions. Defaults to `True` for folders and `False` for files. run: `Run | bool | None = None` The run that creates the artifact. If `False`, suppress tracking the run. If `None`, infer the run from the global run context. branch: `Branch | None = None` The branch of the artifact. If `None`, uses the current branch. space: `Space | None = None` The space of the artifact. If `None`, uses the current space. storage: `Storage | None = None` The storage location for the artifact. If `None`, uses the default (:attr:`~lamindb.core.Settings.storage`). skip_hash_lookup: `bool = False` Skip the hash lookup so that a new artifact is created even if an artifact with the same hash already exists. Empty files are always treated as if this were `True` because empty content hashes are not used for deduplication. Examples: Create an artifact **from a local file or folder**:: artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save() artifact = ln.Artifact("./my_folder", key="project1/my_folder").save() Calling `.save()` copies or uploads the file to the default storage location of your lamindb instance. 
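To double-check where the data landed, you can inspect the storage location and path (a minimal sketch; the printed values depend on your instance)::

            artifact.storage.root  # the registered storage root, e.g., "s3://my-bucket" or a local directory
            artifact.path  # the artifact's full path within that storage location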
If you create an artifact **from a remote file or folder**, lamindb registers the S3 `key` and avoids copying the data:: artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save() # can omit key/description because file is remote If you then want to query & access the artifact later on, this is how you do it:: artifact = ln.Artifact.get(key="examples/my_file.parquet") cached_path = artifact.cache() # sync to local cache & get local path If the storage format supports it, you can load the artifact directly into memory or query it through a streaming interface, e.g., for parquet files:: df = artifact.load() # load parquet file as DataFrame pyarrow_dataset = artifact.open() # open a streaming file-like object To bulk-create artifacts for every file in a directory and **group them in a folder**, use :meth:`~lamindb.Artifact.from_dir`:: artifacts = ln.Artifact.from_dir("project_alpha/run_001").save() # create one artifact per file in the directory artifacts = ln.Artifact.filter(key__startswith="project_alpha/run_001/") # query ingested artifacts via the folder prefix To create a **versioned immutable collection** of artifacts for a data release, use :class:`~lamindb.Collection`:: collection = ln.Collection(artifacts, key="project_alpha/run_001").save() .. dropdown:: Virtual folders (key prefixes) vs. :class:`~lamindb.Collection` objects - prefix query on `key`: If a colleague adds a new file to that prefix tomorrow, your `filter(key__startswith=...)` result will change. - collection: A collection object provides a `uid` for every version and its content won't change. If you want to **validate & annotate** a dataframe or an array using the feature & label registries, pass `schema` to one of the `.from_dataframe()`, `.from_anndata()`, ... constructors:: artifact = ln.Artifact.from_dataframe( "./my_file.parquet", key="my_dataset.parquet", schema="valid_features" ).save() To annotate by **external features**:: artifact = ln.Artifact("./my_file.parquet", features={"cell_type_by_model": "T cell"}).save() You can make a **new version** of an artifact by passing an existing `key`:: artifact_v2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save() artifact_v2.versions.to_dataframe() # see all versions You can write artifacts to **non-default storage locations** by passing the `storage` argument:: storage_loc = ln.Storage.get(root="s3://my_bucket") # get storage location, or create via ln.Storage(root="s3://my_bucket").save() ln.Artifact("./my_file.parquet", key="examples/my_file.parquet", storage=storage_loc).save() # upload to s3://my_bucket Notes: .. _storage-formats-note: .. dropdown:: Storage formats & object types The `Artifact` registry tracks the storage format via :attr:`suffix` and an abstract object type via :attr:`otype`. 
================ ====================================== ================ ==================================================================== description :attr:`suffix` :attr:`otype` Python type examples ================ ====================================== ================ ==================================================================== table `.csv`, `.tsv`, `.parquet`, `.ipc` `"DataFrame"` `pandas.DataFrame`, `polars.DataFrame`, `pyarrow.Table` annotated matrix `.h5ad`, `.zarr`, `.h5mu` `"AnnData"` `anndata.AnnData` stacked matrix `.zarr` `"MuData"` `mudata.MuData` `.tiledbsoma` `"tiledbsoma"` `tiledbsoma.Experiment` spatial data `.zarr` `"SpatialData"` `spatialdata.SpatialData` generic arrays `.h5`, `.zarr`, `.tiledb` --- `h5py.Dataset`, `zarr.Array`, `tiledb.Array` unstructured `.fastq`, `.pdf`, `.vcf`, `.html` --- --- ================ ====================================== ================ ==================================================================== You can map storage formats onto **R types**, e.g., an `AnnData` might be accessed via `anndataR`. Because `otype` accepts any `str`, you can define custom object types that enable queries & logic that you need, e.g., `"SingleCellExperiment"` or `"MyCustomZarrDataStructure"`. LaminDB makes some default choices (e.g., serialize a `DataFrame` as a `.parquet` file). .. dropdown:: Will artifacts get duplicated? If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact. Exception: empty files are not deduplicated by hash and create a new artifact. In concurrent workloads where the same artifact is created repeatedly at the exact same time, `.save()` detects the duplication and will return the existing artifact. .. dropdown:: I cannot come up with a good file name, can I avoid mapping artifacts into a hierarchy? Sometimes you want to **avoid mapping the artifact into a path hierarchy**. You can do so by omitting the `key` argument and only passing `description`. However, note that a shared `description` does not trigger mapping artifacts into the same version family. artifact = ln.Artifact("./my_folder", description="My folder").save() artifact_v2 = ln.Artifact("./my_folder", revises=old_artifact).save() # need to version based on `revises`, a shared description does not trigger a new version .. dropdown:: Why does the constructor look the way it looks? It's inspired by APIs building on AWS S3. Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument. In `boto3 `__:: # signature: S3.Bucket.upload_file(filepath, key) import boto3 s3 = boto3.resource('s3') bucket = s3.Bucket('mybucket') bucket.upload_file('/tmp/hello.txt', 'hello.txt') In `quilt3 `__:: # signature: quilt3.Bucket.put_file(key, filepath) import quilt3 bucket = quilt3.Bucket('mybucket') bucket.put_file('hello.txt', '/tmp/hello.txt') See Also: :class:`~lamindb.Storage` Storage locations for artifacts. :class:`~lamindb.Collection` Collections of artifacts. :meth:`~lamindb.Artifact.from_dir` Bulk-create artifacts for each file in a directory. :meth:`~lamindb.Artifact.from_dataframe` Create an artifact from a `DataFrame`. :meth:`~lamindb.Artifact.from_anndata` Create an artifact from an `AnnData`. :meth:`~lamindb.Artifact.from_spatialdata` Create an artifact from a `SpatialData`. :meth:`~lamindb.Artifact.from_mudata` Create an artifact from a `MuData`. :meth:`~lamindb.Artifact.from_tiledbsoma` Create an artifact from a `tiledbsoma` store. 
:meth:`~lamindb.Artifact.from_lazy` Create a lazy artifact for streaming to auto-generated internal paths. """ class Meta(SQLRecord.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" constraints = [ # a simple hard unique constraint on `hash` clashes with the fact # that pipelines sometimes aim to ingest the exact same file in different # folders # the conditional composite constraint allows duplicating files in different parts of the # file hierarchy, but errors if the same file is to be registered with the same key # In SQL, NULL values are treated specially in unique constraints. # Multiple NULL values are not considered equal to each other for uniqueness purposes. # For non-NULL keys models.UniqueConstraint( fields=["storage", "key", "hash"], condition=models.Q(key__isnull=False), name="unique_artifact_storage_key_hash_not_null", ), # For NULL keys (only storage + hash need to be unique) models.UniqueConstraint( fields=["storage", "hash"], condition=models.Q(key__isnull=True), name="unique_artifact_storage_hash_null_key", ), ] _TRACK_FIELDS = ("space_id", "is_latest", "suffix", "key") _len_full_uid: int = 20 _len_stem_uid: int = 16 _name_field: str = "key" @property def features(self) -> FeatureManager: """Feature manager. Annotate an artifact with features:: artifact.features.set_values({ "species": "human", "scientist": ['Barbara McClintock', 'Edgar Anderson'], "temperature": 27.6, "experiment": "Experiment 1" }) Query artifacts by features:: ln.Artifact.filter(scientist="Barbara McClintock") Get all feature annotations as a dictionary:: d = artifact.features.get_values() Get a value for a single feature:: organism = artifact.features["species"] # returns an Organism object, not "human" temperature = artifact.features["temperature"] # returns a temperature value, a float Note that `get_values()` returns identifiers for categorical values (for example, the string "human" for an `Organism`), while the `[]` accessor returns the corresponding Python object. See also :meth:`~lamindb.models.FeatureManager.set_values`. .. dropdown:: Dataset features vs. external features Features may or may not be stored in the dataset, i.e., the artifact content in storage. If you pass a schema to :class:`~lamindb.Artifact.from_dataframe` you validate the columns of the `DataFrame` and annotate with values parsed from these columns. `artifact.features.set_values()`, by contrast, does **not** validate the content of the artifact. """ from ._feature_manager import FeatureManager return FeatureManager(self) @property def labels(self) -> LabelManager: """Label manager. A way to access all label annotations of an artifact, irrespective of their type. To annotate with labels, use the type-specific accessor, for example:: experiment = ln.Record(name="Experiment 1").save() artifact.records.add(experiment) project = ln.Project(name="Project A").save() artifact.projects.add(project) """ from ._label_manager import LabelManager return LabelManager(self) id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, db_index=True, max_length=_len_full_uid ) """A universal random id.""" # the max length of 1024 equals the max length of a S3 key key: str | None = CharField(db_index=True, null=True, max_length=1024) """A (virtual) relative file path within the artifact's storage location. Setting a `key` is useful to automatically group artifacts into a version family. 
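For example (a minimal sketch; keys and file names are illustrative)::

        v1 = ln.Artifact("./data.parquet", key="examples/data.parquet").save()
        v2 = ln.Artifact("./data_v2.parquet", key="examples/data.parquet").save()  # same key creates a new version
        v2.versions.to_dataframe()  # list all versions in the family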
LaminDB defaults to a virtual file path to make renaming of data in object storage easy. If you register existing files in a storage location, the `key` equals the actual filepath on the underlying filesystem or object store. """ _real_key: str | None = CharField(db_index=True, null=True, max_length=1024) """An optional real storage key.""" # db_index on description because sometimes we query for equality in the case of artifacts description: str | None = TextField(null=True, db_index=True) """A description.""" storage: Storage = ForeignKey( Storage, PROTECT, related_name="artifacts", editable=False ) """Storage location, e.g. an S3 or GCP bucket or a local directory ← :attr:`~lamindb.Storage.artifacts`.""" suffix: str = CharField(max_length=30, db_index=True, editable=False) # Initially, we thought about having this be nullable to indicate folders # But, for instance, .zarr is stored in a folder that ends with a .zarr suffix """The path suffix or an empty string if no suffix exists. This is either a file suffix (`".csv"`, `".h5ad"`, etc.) or the empty string "". """ kind: ArtifactKind | str | None = CharField( max_length=20, db_index=True, null=True, ) """:class:`~lamindb.base.types.ArtifactKind` or custom `str` value (default `None`).""" otype: ( Literal["DataFrame", "AnnData", "MuData", "SpatialData", "tiledbsoma"] | str | None ) = CharField(max_length=64, db_index=True, null=True, editable=False) """The object type represented as a string. The field is automatically set when using the `from_dataframe()`, `from_anndata()`, ... constructors. Unstructured artifacts have `otype=None`. The field also accepts custom `str` values to allow for building logic around them in third-party packages. See section `storage formats & object types `__ for more background. """ size: int | None = BigIntegerField( null=True, db_index=True, default=None, editable=False ) """The size in bytes. Examples: 1KB is 1e3 bytes, 1MB is 1e6, 1GB is 1e9, 1TB is 1e12 etc. """ hash: str | None = CharField( max_length=HASH_LENGTH, db_index=True, null=True, editable=False ) """The hash or pseudo-hash of the artifact content in storage. Useful to ascertain integrity and avoid duplication. Different versions of the artifact have different hashes. """ n_files: int | None = BigIntegerField( null=True, db_index=True, default=None, editable=False ) """The number of files for folder-like artifacts. Is `None` for file-like artifacts. Note that some arrays are also stored as folders, e.g., `.zarr` or `.tiledbsoma`. """ n_observations: int | None = BigIntegerField( null=True, db_index=True, default=None, editable=False ) """The number of observations in this artifact. Typically, this denotes the first array dimension.
""" _hash_type: str | None = CharField( max_length=30, db_index=True, null=True, editable=False ) """Type of hash.""" run: Run | None = ForeignKey( Run, PROTECT, related_name="output_artifacts", null=True, default=None, editable=False, ) """The run that created the artifact ← :attr:`~lamindb.Run.output_artifacts`.""" input_of_runs: RelatedManager[Run] = models.ManyToManyField( Run, related_name="input_artifacts" ) """The runs that use this artifact as an input ← :attr:`~lamindb.Run.input_artifacts`.""" recreating_runs: RelatedManager[Run] = models.ManyToManyField( "Run", related_name="recreated_artifacts", ) """The runs that re-created the artifact after its initial creation ← :attr:`~lamindb.Run.recreated_artifacts`.""" collections: RelatedManager[Collection] """The collections that this artifact is part of ← :attr:`~lamindb.Collection.artifacts`.""" schema: Schema | None = ForeignKey( Schema, PROTECT, null=True, default=None, related_name="validated_artifacts", ) """The validating schema of this artifact ← :attr:`~lamindb.Schema.validated_artifacts`. The validating schema is helpful to query artifacts that were validated by the same schema. """ schemas: RelatedManager[Schema] = models.ManyToManyField( Schema, related_name="artifacts", through="ArtifactSchema" ) """The inferred schemas of this artifact ← :attr:`~lamindb.Schema.artifacts`. The inferred schemas are helpful to answer the question: "Which features are present in the artifact?" The validating schema typically allows a range of valid actual dataset schemas. The inferred schemas link the actual schemas of the artifact, and are auto-generated by parsing the artifact content during validation. """ json_values: RelatedManager[JsonValue] = models.ManyToManyField( JsonValue, through="ArtifactJsonValue", related_name="artifacts" ) """The feature-indexed JSON values annotating this artifact ← :attr:`~lamindb.JsonValue.artifacts`.""" _key_is_virtual: bool = BooleanField() """Indicates whether `key` is virtual or part of an actual file path.""" # be mindful that below, passing related_name="+" leads to errors _actions: RelatedManager[Artifact] = models.ManyToManyField( "self", symmetrical=False, related_name="_action_targets" ) """The actions to attach for the UI.""" created_by: User = ForeignKey( "lamindb.User", PROTECT, default=current_user_id, related_name="created_artifacts", editable=False, ) """The creator of this artifact ← :attr:`~lamindb.User.created_artifacts`.""" _overwrite_versions: bool = BooleanField(default=None) """See corresponding property `overwrite_versions`.""" ulabels: RelatedManager[ULabel] """The ulabels annotating this artifact ← :attr:`~lamindb.ULabel.artifacts`.""" users: RelatedManager[User] """The users annotating this artifact ← :attr:`~lamindb.User.artifacts`.""" projects: RelatedManager[Project] """The projects annotating this artifact ← :attr:`~lamindb.Project.artifacts`.""" references: RelatedManager[Reference] """The references annotating this artifact ← :attr:`~lamindb.Reference.artifacts`.""" records: RelatedManager[Record] """The records annotating this artifact ← :attr:`~lamindb.Record.artifacts`.""" runs: RelatedManager[Run] """The runs annotating this artifact ← :attr:`~lamindb.Run.artifacts`.""" linked_by_runs: RelatedManager[Run] """The runs linking this artifact ← :attr:`~lamindb.Run.linked_by_artifacts`.""" artifacts: RelatedManager[Artifact] = models.ManyToManyField( "Artifact", through="ArtifactArtifact", symmetrical=False, related_name="linked_by_artifacts", ) """The annotating artifacts of 
this artifact ← :attr:`~lamindb.Artifact.linked_by_artifacts`.""" linked_by_artifacts: RelatedManager[Artifact] """The artifacts annotated by this artifact ← :attr:`~lamindb.Artifact.artifacts`.""" linked_in_records: RelatedManager[Record] = models.ManyToManyField( "Record", through="RecordArtifact", related_name="linked_artifacts" ) """The records linking this artifact as a feature value ← :attr:`~lamindb.Record.linked_artifacts`.""" ablocks: RelatedManager[ArtifactBlock] """Attached blocks ← :attr:`~lamindb.ArtifactBlock.artifact`.""" @overload def __init__( self, path: AnyPathStr, *, key: str | None = None, description: str | None = None, kind: ArtifactKind | str | None = None, features: dict[str, Any] | None = None, schema: Schema | None = None, revises: Artifact | None = None, overwrite_versions: bool | None = None, run: Run | False | None = None, storage: Storage | None = None, branch: Branch | None = None, space: Space | None = None, skip_hash_lookup: bool = False, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): # check whether we are called with db args if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None # now proceed with the user-facing constructor if len(args) > 1: raise ValueError("Only one non-keyword arg allowed: path") if "data" in kwargs: warnings.warn( "`data` argument was renamed to `path` and will be removed in a future release.", DeprecationWarning, stacklevel=2, ) path = kwargs.pop("data") else: path = kwargs.pop("path") if len(args) == 0 else args[0] kind: str = kwargs.pop("kind", None) key: str | None = kwargs.pop("key", None) using_key = kwargs.pop("using_key", None) description: str | None = kwargs.pop("description", None) revises: Artifact | None = kwargs.pop("revises", None) if revises is not None: if not isinstance(revises, Artifact): raise TypeError("`revises` has to be of type `Artifact`") if description is None: description = revises.description overwrite_versions: bool | None = kwargs.pop("overwrite_versions", None) version_tag: str | None = kwargs.pop("version_tag", kwargs.pop("version", None)) features: dict[str, Any] | None = kwargs.pop("features", None) skip_hash_lookup: bool = kwargs.pop("skip_hash_lookup", False) to_disk_kwargs: dict[str, Any] | None = kwargs.pop("to_disk_kwargs", None) format = kwargs.pop("format", None) _key_is_virtual = kwargs.pop("_key_is_virtual", None) _is_internal_call = kwargs.pop("_is_internal_call", False) skip_check_exists = kwargs.pop("skip_check_exists", False) if key is not None and _s().AUTO_KEY_PREFIX in key: raise ValueError( f"Do not pass key that contains a managed storage path in `{_s().AUTO_KEY_PREFIX}`" ) # below is for internal calls that require defining the storage location # ahead of constructing the Artifact if isinstance(path, (str, Path, UPath)) and _s().AUTO_KEY_PREFIX in str(path): if _is_internal_call: if _key_is_virtual is False: raise ValueError( "Do not pass _key_is_virtual=False with _is_internal_call=True." ) is_automanaged_path = True user_provided_key = key key = None else: raise ValueError( f"Do not pass path inside the `{_s().AUTO_KEY_PREFIX}` directory." 
) else: is_automanaged_path = False # validate external features if passed with a schema schema: Schema | None = _sqlrecord_or_id( Schema, kwargs.pop("schema", None), kwargs.pop("schema_id", None) ) if features is not None: self._external_features = features if schema is not None: from lamindb.curators.core import ExperimentalDictCurator validation_schema = schema ExperimentalDictCurator(features, validation_schema).validate() # check_type is False because run can be False also, see get_run run: Run | None | bool = _sqlrecord_or_id( Run, kwargs.pop("run", None), kwargs.pop("run_id", None), check_type=False ) branch: Branch | None = _sqlrecord_or_id( Branch, kwargs.pop("branch", None), kwargs.pop("branch_id", None) ) space: Space | None = _sqlrecord_or_id( Space, kwargs.pop("space", None), kwargs.pop("space_id", None) ) storage: Storage | None = _sqlrecord_or_id( Storage, kwargs.pop("storage", None), kwargs.pop("storage_id", None) ) storage_was_passed = False if storage is not None: storage_was_passed = True elif ( setup_settings.instance.keep_artifacts_local and setup_settings.instance._local_storage is not None ): storage = setup_settings.instance.local_storage.record else: storage = setup_settings.instance.storage.record if space is None: from lamindb import context as run_context if run_context.space is not None: space = run_context.space elif setup_settings.space is not None: space = setup_settings.space # space - storage consistency is also checked in .save() when the space is changed if space is not None and space.id != storage.space_id: if storage_was_passed: logger.warning( "storage argument ignored as storage information from space takes precedence" ) storage_locs_for_space = Storage.filter( space=space, instance_uid=setup_settings.instance.uid ).order_by("id") n_storage_locs_for_space = storage_locs_for_space.count() if n_storage_locs_for_space == 0: raise NoStorageLocationForSpace( "No storage location found for space.\n" "Either create one via ln.Storage(root='create-s3', space=space).save()\n" "Or start managing access to an existing storage location via the space: storage_loc.space = space; storage.save()" ) else: storage = storage_locs_for_space.first() if n_storage_locs_for_space > 1: other_storage_locs = ",".join( f"{s.root}" for s in storage_locs_for_space[1:] ) logger.warning( f"more than one storage location is managed by this instance for space {space},\n" f"choosing root={storage.root}\n" ) logger.important_hint( f"to choose one of the other storage locations ({other_storage_locs}), pass `storage` to the Artifact constructor" ) otype = kwargs.pop("otype") if "otype" in kwargs else None if isinstance(path, str) and path.startswith("s3:///"): # issue in Groovy / nf-lamin producing malformed S3 paths # https://laminlabs.slack.com/archives/C08J590666Q/p1751315027830849?thread_ts=1751039961.479259&cid=C08J590666Q path = path.replace("s3:///", "s3://") otype = check_otype_artifact( data=path, otype=otype, cloud_warning=not _is_internal_call ) if "type" in kwargs: logger.warning("`type` will be removed soon, please use `kind`") kind = kwargs.pop("type") if not len(kwargs) == 0: valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Artifact)]) raise FieldValidationError( f"Only {valid_keywords} can be passed, you passed: {kwargs}" ) if revises is not None and key is not None and revises.key != key: logger.warning(f"renaming artifact from '{revises.key}' to {key}") provisional_uid, revises = create_uid(revises=revises, version_tag=version_tag) run = get_run(run) 
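# Sketch of how a run is typically supplied implicitly rather than passed
# explicitly (assuming the standard `ln.track()` context; illustration only):
#
#     ln.track()  # registers a Transform + Run for the current script or notebook
#     artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
#     assert artifact.run is not None  # inferred from the global run context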
kwargs_or_artifact, privates = get_artifact_kwargs_from_data( data=path, key=key, run=run, format=format, provisional_uid=provisional_uid, version_tag=version_tag, storage=storage, using_key=using_key, skip_check_exists=skip_check_exists, overwrite_versions=overwrite_versions, skip_hash_lookup=skip_hash_lookup, to_disk_kwargs=to_disk_kwargs, key_is_virtual=_key_is_virtual, ) def set_private_attributes(): if path is not None and "local_filepath" in privates: self._local_filepath = privates["local_filepath"] self._cloud_filepath = privates["cloud_filepath"] self._memory_rep = privates["memory_rep"] self._to_store = not privates["check_path_in_storage"] if ( self._to_store and not privates["is_artifact_storage_managed_by_current_instance"] ): raise ValueError( "Cannot create an artifact in a storage location that is not managed by the current instance." ) # an object with the same hash already exists if isinstance(kwargs_or_artifact, Artifact): from .sqlrecord import init_self_from_db, update_attributes init_self_from_db(self, kwargs_or_artifact) # update key from inferred value key = privates.pop("key") # adding "key" here is dangerous because key might be auto-populated attr_to_update = {"description": description} if schema is not None: attr_to_update["schema"] = schema if kwargs_or_artifact._key_is_virtual and kwargs_or_artifact.key is None: attr_to_update["key"] = key elif self.key != key and key is not None: if not self.path.exists(): logger.warning(f"updating previous key {self.key} to new key {key}") self.key = key # Keep tracked state aligned with this internal dedup-time key # normalization so save() doesn't treat it as a user key edit. self._original_values["key"] = key assert self.path.exists(), ( # noqa: S101 f"The underlying file for artifact {self} does not exist anymore, clean up the artifact record." 
) # noqa: S101 else: logger.warning( f"key {self.key} on existing artifact differs from passed key {key}, keeping original key; update manually if needed or pass skip_hash_lookup if you want to duplicate the artifact" ) update_attributes(self, attr_to_update) # an existing artifact might have an imcomplete upload and hence we should # re-populate _local_filepath because this is what triggers the upload set_private_attributes() populate_subsequent_run(self, run) return None else: kwargs = kwargs_or_artifact kwargs["schema"] = schema if revises is None: revises = kwargs_or_artifact.pop("revises") set_private_attributes() if is_automanaged_path and _is_internal_call: kwargs["_key_is_virtual"] = True assert _s().AUTO_KEY_PREFIX in kwargs["key"] # noqa: S101 uid = ( kwargs["key"] .replace(_s().AUTO_KEY_PREFIX, "") .replace(kwargs["suffix"], "") ) kwargs["key"] = user_provided_key if revises is not None: assert uid.startswith(revises.stem_uid) # noqa: S101 if len(uid) == 16: if revises is None: uid += "0000" else: uid, revises = create_uid(revises=revises, version_tag=version_tag) kwargs["uid"] = uid # only set key now so that we don't perform a look-up on it in case revises is passed if revises is not None and revises.key is not None and kwargs["key"] is None: kwargs["key"] = revises.key kwargs["kind"] = kind kwargs["version_tag"] = version_tag kwargs["description"] = description kwargs["branch"] = branch kwargs["space"] = space kwargs["otype"] = otype kwargs["revises"] = revises # this check needs to come down here because key might be populated from an # existing file path during get_artifact_kwargs_from_data() if ( kwargs["key"] is None and kwargs["description"] is None and kwargs["run"] is None ): raise ValueError("Pass one of key, run or description as a parameter") super().__init__(**kwargs) @property def transform(self) -> Transform | None: """Transform whose run created the artifact.""" return self.run.transform if self.run is not None else None @property def overwrite_versions(self) -> bool: """Indicates whether to keep or overwrite versions. It defaults to `False` for file-like artifacts and to `True` for folder-like artifacts. Note that this requires significant storage space for large folders with many duplicated files. Currently, `lamindb` does *not* de-duplicate files across versions as in git, but keeps all files for all versions of the folder in storage. """ return self._overwrite_versions @property def _storage_ongoing(self) -> bool: """Whether the artifact is still in the process of being saved to storage (uploaded for cloud storage). - `True`: write started but not completed - `False`: storage completed or not yet started In the JSON `_aux`field, `True` is represented as `{"so": 1}` and `False` as an absent `"so"` key. """ if self._aux is None: return False if self._aux.get("so") == 1: return True else: return False @_storage_ongoing.setter def _storage_ongoing(self, value: bool | None) -> None: if value is None or value is False: if self._aux is not None and "so" in self._aux: del self._aux["so"] if not self._aux: self._aux = None else: if self._aux is None: self._aux = {} assert value is True self._aux["so"] = 1 @property @deprecated("schemas") def feature_sets(self): return self.schemas @property def path(self) -> UPath: """Path. 
Example:: import lamindb as ln # File in cloud storage, here AWS S3: artifact = ln.Artifact("s3://my-bucket/my-file.csv").save() artifact.path #> S3QueryPath('s3://my-bucket/my-file.csv') # File in local storage: artifact = ln.Artifact("./myfile.csv", key="myfile.csv").save() artifact.path #> PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/myfile.csv') """ filepath, _ = _s().filepath_from_artifact(self, using_key=settings._using_key) return filepath @property def _cache_path(self) -> UPath: filepath, cache_key = _s().filepath_cache_key_from_artifact( self, using_key=settings._using_key ) if isinstance(filepath, LocalPathClasses): return filepath return setup_settings.paths.cloud_to_local_no_update( filepath, cache_key=cache_key ) @strict_classmethod def get( cls, idlike: int | str | None = None, *, key: str | None = None, path: AnyPathStr | None = None, is_run_input: bool | Run = False, **expressions, ) -> Artifact: """Get a single artifact. Args: idlike: Either a uid stub, uid or an integer id. key: An optional key to query for. path: An optional full path to query for, including the storage root. is_run_input: Whether to track this artifact as run input. expressions: Other fields and values passed as Django query expressions. Raises: :exc:`lamindb.errors.DoesNotExist`: In case no matching record is found. See Also: - Guide: :doc:`registries` - Method in `SQLRecord` base class: :meth:`~lamindb.models.SQLRecord.get` Examples: :: artifact = ln.Artifact.get("tCUkRcaEjTjhtozp") # gets latest version for family tCUkRcaEjTjhtozp artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0005") # gets version 0005 for family tCUkRcaEjTjhtozp artifact = ln.Artifact.get(key="examples/my_file.parquet") # gets latest version for a key artifact = ln.Artifact.get(key="examples/my_file.parquet", version="2") # pass a version tag artifact = ln.Artifact.get(path="s3://bucket/folder/adata.h5ad") """ if key is not None: expressions["key"] = key if path is not None: expressions["path"] = path return QuerySet(model=cls).get(idlike, is_run_input=is_run_input, **expressions) @strict_classmethod def filter( cls, *queries, **expressions, ) -> QuerySet: """Query a set of artifacts. Args: *queries: `Q` expressions. **expressions: Features & fields via the Django query syntax. See Also: - Guide: :doc:`docs:registries` Examples: Query by fields:: ln.Artifact.filter(key="examples/my_file.parquet") Query by features:: ln.Artifact.filter(cell_type_by_model__name="T cell") """ # from Registry metaclass return type(cls).filter(cls, *queries, **expressions) @classmethod def from_lazy( cls, suffix: str, overwrite_versions: bool, key: str | None = None, description: str | None = None, run: Run | None = None, **kwargs, ) -> LazyArtifact: """Create a lazy artifact for streaming to auto-generated internal paths. This is needed when it is desirable to stream to a `lamindb` auto-generated internal path and register the path as an artifact. It allows writing directly into the default cloud (or local) storage of the current instance and then saving as an :class:`~lamindb.Artifact`. The lazy artifact object (see :class:`~lamindb.models.LazyArtifact`) creates a real artifact on `.save()` with the provided arguments. Args: suffix: The suffix for the auto-generated internal path overwrite_versions: Whether to overwrite versions. key: An optional key to reference the artifact. description: A description. run: The run that creates the artifact. **kwargs: Other keyword arguments for the artifact to be created.
Examples: Local storage: create a lazy artifact, stream to the path, then save:: lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr") zarr.open(lazy.path, mode="w")["test"] = np.array(["test"]) artifact = lazy.save() Cloud storage (e.g. S3): use `zarr.storage.FsspecStore` to stream arrays:: lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr") store = zarr.storage.FsspecStore.from_url(lazy.path.as_posix()) group = zarr.open(store, mode="w") group["ones"] = np.ones(3) artifact = lazy.save() """ args = {"key": key, "description": description, "run": run, **kwargs} return LazyArtifact(suffix, overwrite_versions, **args) @classmethod def from_dataframe( cls, df: pd.DataFrame | AnyPathStr, *, key: str | None = None, description: str | None = None, run: Run | None = None, revises: Artifact | None = None, schema: Schema | Literal["valid_features"] | None = None, features: dict[str, Any] | None = None, parquet_kwargs: dict[str, Any] | None = None, csv_kwargs: dict[str, Any] | None = None, **kwargs, ) -> Artifact: """Create from `DataFrame`, optionally validate & annotate. Sets `.otype` to `"DataFrame"` and populates `.n_observations`. Args: df: A `DataFrame` object or an `AnyPathStr` pointing to a `DataFrame` in storage, e.g. a `.parquet` or `.csv` file. key: A relative path within default storage, e.g., `"myfolder/myfile.parquet"`. description: A description. revises: An old version of the artifact. run: The run that creates the artifact. schema: A schema that defines how to validate & annotate. features: Additional external features to annotate the artifact via :class:`~lamindb.models.FeatureManager.set_values` (keys can be feature names or `Feature` objects). parquet_kwargs: Additional keyword arguments passed to the `pandas.DataFrame.to_parquet` method, which are passed on to `pyarrow.parquet.ParquetWriter`. csv_kwargs: Additional keyword arguments passed to the `pandas.DataFrame.to_csv` method. Examples: No validation and annotation:: ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet").save() With validation and annotation:: ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet", schema="valid_features").save() Under the hood, this uses the following built-in schema (:func:`~lamindb.examples.schemas.valid_features`):: schema = ln.Schema(name="valid_features", itype="Feature").save() External features: .. literalinclude:: scripts/curate_dataframe_external_features.py :language: python Parquet kwargs: ..
literalinclude:: scripts/test_artifact_parquet.py :language: python """ if "format" not in kwargs and key is not None and key.endswith(".csv"): kwargs["format"] = ".csv" if schema == "valid_features": from lamindb import examples schema = examples.schemas.valid_features() to_disk_kwargs: dict[str, Any] = parquet_kwargs or csv_kwargs artifact = Artifact( # type: ignore path=df, key=key, run=run, description=description, revises=revises, otype="DataFrame", kind="dataset", to_disk_kwargs=to_disk_kwargs, **kwargs, ) if data_is_dataframe(df): artifact.n_observations = len(df) else: # must be a str or path path = create_path(df) if path.suffix == ".parquet": import pyarrow.parquet as pq with path.open("rb") as f: artifact.n_observations = pq.read_metadata(f).num_rows else: # csv/tsv/others have no metadata and would require a full expensive read artifact.n_observations = None if features is not None: artifact._external_features = features if schema is not None: from lamindb.curators.core import DataFrameCurator if not artifact._state.adding and artifact.suffix != ".parquet": logger.warning( f"not re-validating existing artifact as it was stored as {artifact.suffix}, " "which does not maintain categorical dtype information" ) return artifact curator = DataFrameCurator(artifact, schema, features=features) curator.validate() artifact.schema = schema artifact._curator = curator return artifact @classmethod @deprecated("from_dataframe") def from_df( cls, df: pd.DataFrame, *, key: str | None = None, description: str | None = None, run: Run | None = None, revises: Artifact | None = None, schema: Schema | None = None, **kwargs, ) -> Artifact: return cls.from_dataframe( df, key=key, description=description, run=run, revises=revises, schema=schema, **kwargs, ) @classmethod def from_anndata( cls, adata: Union[AnnData, AnyPathStr], *, key: str | None = None, description: str | None = None, run: Run | None = None, revises: Artifact | None = None, schema: Schema | Literal["ensembl_gene_ids_and_valid_features_in_obs"] | None = None, format: Literal["h5ad", "zarr", "anndata.zarr"] | None = None, h5ad_kwargs: dict[str, Any] | None = None, zarr_kwargs: dict[str, Any] | None = None, **kwargs, ) -> Artifact: """Create from `AnnData`, optionally validate & annotate. Sets `.otype` to `"AnnData"` and populates `.n_observations`. Args: adata: An `AnnData` object or a path of AnnData-like. key: A relative path within default storage, e.g., `"myfolder/myfile.h5ad"`. description: A description. revises: An old version of the artifact. run: The run that creates the artifact. schema: A schema that defines how to validate & annotate. format: Storage format used when writing in-memory `AnnData`. In-memory `AnnData` is first written to cache in this format, then saved to instance storage when calling `.save()`. If `None`, infer from `key` suffix when available, otherwise default to `"h5ad"`. If provided, suffix is formed as `"." + format` (e.g., `"zarr"` -> `".zarr"`). h5ad_kwargs: Additional keyword arguments passed to the `anndata.AnnData.write_h5ad` method when writing in-memory `AnnData` to cache. zarr_kwargs: Additional keyword arguments passed to the `anndata.AnnData.write_zarr` method. when writing in-memory `AnnData` to cache. Use `key` with suffix `.zarr` or pass `format="zarr"` for this to work. See Also: :meth:`~lamindb.Collection` Track collections. :class:`~lamindb.Feature` Track features. 
Example: Write H5AD with custom serialization settings:: ln.Artifact.from_anndata( adata, key="examples/dataset1.h5ad", h5ad_kwargs={"compression": "gzip"}, ).save() Write Zarr with custom chunking settings:: ln.Artifact.from_anndata( adata, key="examples/dataset1.zarr", format="zarr", zarr_kwargs={"chunks": [1024, 1024]}, ).save() No validation and annotation:: ln.Artifact.from_anndata(adata, key="examples/dataset1.h5ad").save() With validation and annotation:: ln.Artifact.from_anndata(adata, key="examples/dataset1.h5ad", schema="ensembl_gene_ids_and_valid_features_in_obs").save() Under the hood, this uses the following built-in schema (:func:`~lamindb.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs`): .. literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py :language: python This schema transposes the `var` DataFrame during curation, so that one validates and annotates the columns of `var.T`, i.e., `[ENSG00000153563, ENSG00000010610, ENSG00000170458]`. If one doesn't transpose, one would annotate the columns of `var`, i.e., `[gene_symbol, gene_type]`. .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/gLyfToATM7WUzkWW0001.png :width: 800px """ if not data_is_scversedatastructure(adata, "AnnData"): raise ValueError( "data has to be an AnnData object or a path to AnnData-like" ) if schema == "ensembl_gene_ids_and_valid_features_in_obs": from lamindb import examples schema = ( examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs() ) to_disk_kwargs: dict[str, Any] = h5ad_kwargs or zarr_kwargs artifact = Artifact( # type: ignore path=adata, key=key, run=run, description=description, revises=revises, otype="AnnData", kind="dataset", format=format, to_disk_kwargs=to_disk_kwargs, **kwargs, ) # this is done instead of _anndata_n_observations(adata) # because we need a proper path through create_path for cloud paths # for additional upath options etc that create_path adds obj_for_obs: AnnData | UPath if hasattr(artifact, "_memory_rep") and artifact._memory_rep is not None: obj_for_obs = artifact._memory_rep else: # returns ._local_filepath for local files # and the proper path through create_path for cloud paths obj_for_obs = artifact.path from ..core.storage._anndata_accessor import _anndata_n_observations artifact.n_observations = _anndata_n_observations(obj_for_obs) if schema is not None: from ..curators import AnnDataCurator curator = AnnDataCurator(artifact, schema) curator.validate() artifact.schema = schema artifact._curator = curator return artifact @classmethod def from_mudata( cls, mdata: Union[MuData, AnyPathStr], *, key: str | None = None, description: str | None = None, run: Run | None = None, revises: Artifact | None = None, schema: Schema | None = None, **kwargs, ) -> Artifact: """Create from `MuData`, optionally validate & annotate. Sets `.otype` to `"MuData"`. Args: mdata: A `MuData` object. key: A relative path within default storage, e.g., `"myfolder/myfile.h5mu"`. description: A description. revises: An old version of the artifact. run: The run that creates the artifact. schema: A schema that defines how to validate & annotate. See Also: :meth:`~lamindb.Collection` Track collections. :class:`~lamindb.Feature` Track features.
Example:: import lamindb as ln mdata = ln.examples.datasets.mudata_papalexi21_subset() artifact = ln.Artifact.from_mudata(mdata, key="mudata_papalexi21_subset.h5mu").save() """ if not data_is_scversedatastructure(mdata, "MuData"): raise ValueError("data has to be a MuData object or a path to MuData-like") artifact = Artifact( # type: ignore path=mdata, key=key, run=run, description=description, revises=revises, otype="MuData", kind="dataset", **kwargs, ) if not isinstance(mdata, (str, Path, UPath)): artifact.n_observations = mdata.n_obs if schema is not None: from ..curators import MuDataCurator curator = MuDataCurator(artifact, schema) curator.validate() artifact.schema = schema artifact._curator = curator return artifact @classmethod def from_spatialdata( cls, sdata: SpatialData | AnyPathStr, *, key: str | None = None, description: str | None = None, run: Run | None = None, revises: Artifact | None = None, schema: Schema | None = None, **kwargs, ) -> Artifact: """Create from `SpatialData`, optionally validate & annotate. Sets `.otype` to `"SpatialData"`. Args: sdata: A `SpatialData` object. key: A relative path within default storage, e.g., `"myfolder/myfile.zarr"`. description: A description. revises: An old version of the artifact. run: The run that creates the artifact. schema: A schema that defines how to validate & annotate. See Also: :meth:`~lamindb.Collection` Track collections. :class:`~lamindb.Feature` Track features. Example: No validation and annotation:: import lamindb as ln artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr").save() With validation and annotation. First, find a `SpatialData` schema, e.g.:: ln.Schema.filter(otype="SpatialData").to_dataframe() schema = ln.Schema.get(name="spatialdata_blobs_schema") Then, pass the schema to the `from_spatialdata` method:: artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr", schema=schema).save() You can also define a schema from scratch: .. literalinclude:: scripts/define_schema_spatialdata.py :language: python """ if not data_is_scversedatastructure(sdata, "SpatialData"): raise ValueError( "data has to be a SpatialData object or a path to SpatialData-like" ) artifact = Artifact( # type: ignore path=sdata, key=key, run=run, description=description, revises=revises, otype="SpatialData", kind="dataset", **kwargs, ) # ill-defined https://scverse.zulipchat.com/#narrow/channel/315824-spatial/topic/How.20to.20calculate.20the.20number.20of.20observations.3F # artifact.n_observations = ... if schema is not None: from ..curators import SpatialDataCurator curator = SpatialDataCurator(artifact, schema) curator.validate() artifact.schema = schema artifact._curator = curator return artifact @classmethod def from_tiledbsoma( cls, exp: SOMAExperiment | AnyPathStr, *, key: str | None = None, description: str | None = None, run: Run | None = None, revises: Artifact | None = None, **kwargs, ) -> Artifact: """Create from a `tiledbsoma.Experiment` store. Sets `.otype` to `"tiledbsoma"` and populates `.n_observations`. Args: exp: TileDB-SOMA Experiment object or path to Experiment store. key: A relative path within default storage, e.g., `"myfolder/mystore.tiledbsoma"`. description: A description. revises: An old version of the artifact. run: The run that creates the artifact. 
Example:: import lamindb as ln artifact = ln.Artifact.from_tiledbsoma("s3://mybucket/store.tiledbsoma", description="a tiledbsoma store").save() """ if not data_is_soma_experiment(exp): raise ValueError( "data has to be a SOMA Experiment object or a path to SOMA Experiment store." ) # SOMAExperiment.uri may have file:// prefix for local paths which needs stripping for filesystem access. # Other URI schemes (s3://, etc.) are preserved and supported. exp = ( exp.uri.removeprefix("file://") if not isinstance(exp, (str, Path, UPath)) else exp ) artifact = Artifact( # type: ignore path=exp, key=key, run=run, description=description, revises=revises, otype="tiledbsoma", kind="dataset", **kwargs, ) from ..core.storage._tiledbsoma import _soma_n_observations artifact.n_observations = _soma_n_observations(artifact.path) return artifact @classmethod def from_dir( cls, path: AnyPathStr, *, key: str | None = None, run: Run | None = None, ) -> SQLRecordList: """Create a list of :class:`~lamindb.Artifact` objects from a directory. Hint: If you have a high number of files (several 100k) and don't want to track them individually, create a single :class:`~lamindb.Artifact` via ``Artifact(path)`` for them. See, e.g., :doc:`docs:rxrx`. Args: path: Source path of folder. key: Key for storage destination. If `None` and directory is in a registered location, the inferred `key` will reflect the relative position. If `None` and directory is outside of a registered storage location, the inferred key defaults to `path.name`. run: A `Run` object. Example:: import lamindb as ln dir_path = ln.examples.datasets.dir_scrnaseq_cellranger("sample_001", ln.settings.storage) ln.Artifact.from_dir(dir_path).save() # creates one artifact per file in dir_path """ folderpath: UPath = create_path(path) # returns Path for local storage = settings.storage.record using_key = settings._using_key storage, use_existing_storage = process_pathlike(folderpath, storage, using_key) folder_key_path: PurePath | Path if key is None: if not use_existing_storage: logger.warning( "folder is outside existing storage location, will copy files from" f" {path} to {storage.root}/{folderpath.name}" ) folder_key_path = Path(folderpath.name) else: # maintain the hierachy within an existing storage location folder_key_path = get_relative_path_to_directory( folderpath, UPath(storage.root) ) else: folder_key_path = Path(key) folder_key = folder_key_path.as_posix() # silence fine-grained logging verbosity = settings.verbosity verbosity_int = settings._verbosity_int if verbosity_int >= 1: settings.verbosity = "warning" artifacts_dict = {} for filepath in folderpath.rglob("*"): if filepath.is_file(): relative_path = get_relative_path_to_directory(filepath, folderpath) artifact_key = folder_key + "/" + relative_path.as_posix() # if creating from rglob, we don't need to check for existence artifact = Artifact( filepath, run=run, key=artifact_key, skip_check_exists=True ) artifacts_dict[artifact.uid] = artifact settings.verbosity = verbosity # run sanity check on hashes hashes = [ artifact.hash for artifact in artifacts_dict.values() if artifact.hash is not None ] uids = artifacts_dict.keys() n_unique_hashes = len(set(hashes)) if n_unique_hashes == len(hashes): artifacts = SQLRecordList(artifacts_dict.values()) else: # consider exact duplicates (same id, same hash) # below can't happen anymore because artifacts is a dict now # if len(set(uids)) == len(set(hashes)): # logger.warning("dropping duplicate records in list of artifact records") # artifacts = 
list(set(uids)) # consider false duplicates (different id, same hash) if not len(set(uids)) == n_unique_hashes: seen_hashes = set() non_unique_artifacts = { hash: artifact for hash, artifact in artifacts_dict.items() if artifact.hash in seen_hashes or seen_hashes.add(artifact.hash) # type: ignore } display_non_unique = "\n ".join( f"{artifact}" for artifact in non_unique_artifacts ) logger.warning( "there are multiple artifact uids with the same hashes, dropping" f" {len(non_unique_artifacts)} duplicates out of" f" {len(artifacts_dict)} artifacts:\n {display_non_unique}" ) artifacts = SQLRecordList( [ artifact for artifact in artifacts_dict.values() if artifact not in non_unique_artifacts.values() ] ) logger.success( f"created {len(artifacts)} artifacts from directory using storage" f" {storage.root} and key = {folder_key}/" ) return artifacts def replace( self, data: Union[AnyPathStr, pd.DataFrame, AnnData, MuData], run: Run | bool | None = None, format: str | None = None, ) -> None: """Replace the artifact content in storage **without** making a new version. **Note:** If you want to create a new version, do **not** use the `.replace()` method but rather any `Artifact` constructor. Args: data: A file path or in-memory dataset object like a `DataFrame`, `AnnData`, `MuData`, or `SpatialData`. run: `Run | bool | None = None` The run that creates the artifact. If `False`, suppress tracking the run. If `None`, infer the run from the global run context. format: `str | None = None` The format of the data to write into storage. If `None`, infer the format from the data. Example: Query a text file and replace its content:: artifact = ln.Artifact.get(key="my_file.txt") artifact.replace("./my_new_file.txt") artifact.save() Note that you need to call `.save()` to persist the changes in storage. """ storage = settings.storage.record run = get_run(run) kwargs, privates = get_artifact_kwargs_from_data( provisional_uid=self.uid, data=data, key=self.key, run=run, format=format, storage=storage, version_tag=None, is_replace=True, ) # this artifact already exists if isinstance(kwargs, Artifact): return kwargs check_path_in_storage = privates["check_path_in_storage"] if check_path_in_storage: err_msg = ( "Can only replace with a local path not in any Storage. " f"This data is in {Storage.objects.get(id=kwargs['storage_id'])}." ) raise ValueError(err_msg) _overwrite_versions = kwargs["_overwrite_versions"] if self._overwrite_versions != _overwrite_versions: err_msg = "It is not allowed to replace " err_msg += "a folder" if self._overwrite_versions else "a file" err_msg += " with " + ("a folder." 
if _overwrite_versions else "a file.") raise ValueError(err_msg) new_suffix = kwargs["suffix"] if new_suffix != self.suffix: key = self.key real_key = self._real_key if key is not None: new_key = PurePosixPath(key).with_suffix(new_suffix).as_posix() else: new_key = None if (key is not None and not self._key_is_virtual) or real_key is not None: # real_key is not None implies key is not None assert key is not None # noqa: S101 if real_key is not None: self._clear_storagekey = real_key self._real_key = ( PurePosixPath(real_key).with_suffix(new_suffix).as_posix() ) warn_msg = f", _real_key '{real_key}' with '{self._real_key}'" else: self._clear_storagekey = key warn_msg = "" self.key = new_key self._original_values["key"] = new_key logger.warning( f"replacing the file will replace key '{key}' with '{new_key}'{warn_msg}" f" and delete '{self._clear_storagekey}' upon `save()`" ) else: # purely virtual key case self._clear_storagekey = _s().auto_storage_key_from_artifact(self) # might replace None with None, not a big deal self.key = new_key self._original_values["key"] = new_key self.suffix = new_suffix self.size = kwargs["size"] self.hash = kwargs["hash"] self._hash_type = kwargs["_hash_type"] self.run_id = kwargs["run_id"] self.run = kwargs["run"] self.n_files = kwargs["n_files"] self._local_filepath = privates["local_filepath"] self._cloud_filepath = privates["cloud_filepath"] self._memory_rep = privates["memory_rep"] # no need to upload if new file is already in storage self._to_store = not check_path_in_storage # update old suffix with the new one so that the check in artifact save pass # replace() supports changing the suffix self._original_values["suffix"] = self.suffix def open( self, mode: str = "r", engine: Literal["pyarrow", "polars"] = "pyarrow", is_run_input: bool | None = None, **kwargs, ) -> ( PyArrowDataset # PolarsLazyFrame does not implement the context manager protocol hence we need `Iterator` in the type annotation | Iterator[ PolarsLazyFrame ] # note that intersphinx doesn't work for this, hence manual docs link: https://github.com/laminlabs/lamindb/issues/2736#issuecomment-3703889524 | AnnDataAccessor # AnnDataAccessor implements the context manager protocol | SpatialDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment | SOMAMeasurement ): """Open a dataset for streaming. Works for the following object types (storage formats): - `DataFrame` (`.parquet`, `.csv`, `.ipc` files or directories with such files) - `AnnData` (`.h5ad`, `.zarr`) - `SpatialData` (`.zarr`) - `tiledbsoma` (`.tiledbsoma`) - generic arrays (`.h5`, `.zarr`) Args: mode: can be `"r"` or `"w"` (write mode) for `tiledbsoma` stores, `"r"` or `"r+"` for `AnnData` or `SpatialData` `zarr` stores, otherwise should be always `"r"` (read-only mode). engine: Which module to use for lazy loading of a dataframe from `pyarrow` or `polars` compatible formats. This has no effect if the artifact is not a dataframe, i.e. if it is an `AnnData,` `hdf5`, `zarr`, `tiledbsoma` object etc. is_run_input: Whether to track this artifact as run input. **kwargs: Keyword arguments for the accessor, i.e. `h5py` or `zarr` connection, `pyarrow.dataset.dataset`, `polars.scan_*` function. 
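A minimal sketch of forwarding such keyword arguments, assuming a CSV-backed artifact; the `infer_schema_length` option belongs to `polars.scan_csv` and is used purely for illustration::

    artifact = ln.Artifact.get(key="sequences/mydataset.csv")
    with artifact.open(engine="polars", infer_schema_length=None) as lf:
        print(lf.head().collect())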
Returns: Streaming accessors, in particular, a :class:`pyarrow:pyarrow.dataset.Dataset` object, a context manager yielding a `polars.LazyFrame `__, and objects of type :class:`~lamindb.core.storage.AnnDataAccessor`, :class:`~lamindb.core.storage.SpatialDataAccessor`, :class:`~lamindb.core.storage.BackedAccessor`, :class:`tiledbsoma:tiledbsoma.Collection`, :class:`tiledbsoma.Experiment`, :class:`tiledbsoma.Measurement`. Note: For TileDB-SOMA stores on S3 with federated credentials, credentials are updated only when the storage is opened, not while the store handle is held open. If credentials expire during a long-lived session, close the store and open it again to refresh. Examples: Open a `DataFrame`-like artifact via :class:`pyarrow:pyarrow.dataset.Dataset`:: artifact = ln.Artifact.get(key="sequences/mydataset.parquet") artifact.open() #> pyarrow._dataset.FileSystemDataset Open a `DataFrame`-like artifact via `polars.LazyFrame `__:: artifact = ln.Artifact.get(key="sequences/mydataset.parquet") with artifact.open(engine="polars") as df: # use the `polars.LazyFrame` object similar to a `DataFrame` object Open an `AnnData`-like artifact via :class:`~lamindb.core.storage.AnnDataAccessor`:: import lamindb as ln artifact = ln.Artifact.get(key="scrna/mydataset.h5ad") with artifact.open() as adata: # use the `AnnDataAccessor` similar to an `AnnData` object For more examples and background, see guide: :doc:`/arrays`. """ from ..core.storage._backed_access import _track_writes_factory, backed_access from ..core.storage._polars_lazy_df import POLARS_SUFFIXES from ..core.storage._pyarrow_dataset import PYARROW_SUFFIXES if self._overwrite_versions and not self.is_latest: raise ValueError(OUTDATED_ARTIFACT_FILES_OVERWRITTEN_MSG) # all hdf5 suffixes including gzipped h5_suffixes = [".h5", ".hdf5", ".h5ad"] h5_gz_suffixes = [] for s in h5_suffixes: h5_gz_suffixes += [s, s + ".gz", s + ".tar.gz"] # ignore empty suffix for now df_suffixes = tuple(set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES)) suffixes = ( ( "", ".zarr", ".anndata.zarr", ".tiledbsoma", ) + tuple(h5_gz_suffixes) + df_suffixes ) suffix = self.suffix if suffix not in suffixes: raise ValueError( "Artifact should have a zarr, h5, tiledbsoma object" " or a compatible `pyarrow.dataset.dataset` or `polars.scan_*` directory" " as the underlying data, please use one of the following suffixes" f" for the object name: {', '.join(suffixes[1:])}." f" Or no suffix for a folder with {', '.join(df_suffixes)} files" " (no mixing allowed)." ) using_key = settings._using_key filepath, cache_key = _s().filepath_cache_key_from_artifact( self, using_key=using_key ) is_tiledbsoma_w = ( filepath.name == "soma" or suffix == ".tiledbsoma" ) and mode == "w" is_zarr_w = suffix == ".zarr" and mode == "r+" if mode != "r": if not (is_tiledbsoma_w or is_zarr_w): raise ValueError( f"It is not allowed to open a {suffix} object with `mode='{mode}'`. " "You can open all supported formats with `mode='r'`, " "a tiledbsoma store with `mode='w'`, " "AnnData or SpatialData zarr store with `mode='r+'`." ) elif not self.overwrite_versions: raise ValueError( "It is not possible to open artifacts having `overwrite_versions=False` " "in non-read mode (other than `mode='r'`)." 
) # consider the case where an object is already locally cached localpath = setup_settings.paths.cloud_to_local_no_update( filepath, cache_key=cache_key ) if is_tiledbsoma_w or is_zarr_w: open_cache = False else: open_cache = not isinstance( filepath, LocalPathClasses ) and not filepath.synchronize_to(localpath, just_check=True) if open_cache: try: access = backed_access( localpath, mode, engine, using_key=using_key, **kwargs ) except Exception as e: # also ignore ValueError here because # such errors most probably just imply an incorrect argument if isinstance(e, (ImportError, ValueError)) or isinstance( filepath, LocalPathClasses ): raise e logger.warning( f"The cache might be corrupted: {e}. Trying to open directly." ) access = backed_access( filepath, mode, engine, using_key=using_key, **kwargs ) # happens only if backed_access has been successful # delete the corrupted cache if localpath.is_dir(): shutil.rmtree(localpath) else: localpath.unlink(missing_ok=True) else: access = backed_access(self, mode, engine, using_key=using_key, **kwargs) if is_tiledbsoma_w: def finalize(): nonlocal self, filepath, localpath if not isinstance(filepath, LocalPathClasses): _, hash, _, _ = get_stat_dir_cloud(filepath) else: # this can be very slow _, hash, _, _ = hash_dir(filepath) if self.hash != hash: from .sqlrecord import init_self_from_db new_version = Artifact( filepath, revises=self, _is_internal_call=True ).save() # note: sets _state.db = "default" init_self_from_db(self, new_version) if localpath != filepath and localpath.exists(): shutil.rmtree(localpath) access = _track_writes_factory(access, finalize) # only call if open is successful track_run_input(self, is_run_input) return access def load( self, *, is_run_input: bool | None = None, mute: bool = False, **kwargs ) -> ( pd.DataFrame | ScverseDataStructures | dict[str, Any] | list[Any] | AnyPathStr | None ): """Cache artifact in local cache and then load it into memory. See: :mod:`~lamindb.core.loaders`. Args: is_run_input: Whether to track this artifact as run input. mute: Silence logging of caching progress. **kwargs: Keyword arguments for the loader. Examples: Load a `DataFrame`-like artifact:: df = artifact.load() Load an `AnnData`-like artifact:: adata = artifact.load() """ from ..core.loaders import load_to_memory if self._overwrite_versions and not self.is_latest: raise ValueError(OUTDATED_ARTIFACT_FILES_OVERWRITTEN_MSG) if hasattr(self, "_memory_rep") and self._memory_rep is not None: access_memory = self._memory_rep # SpatialData objects' zarr stores are moved when saved # SpatialData's __repr__ method attempts to access information from the old path # Therefore, we need to update the in-memory path to the now moved Artifact storage path if access_memory.__class__.__name__ == "SpatialData": access_memory.path = self._cache_path else: filepath, cache_key = _s().filepath_cache_key_from_artifact( self, using_key=settings._using_key ) cache_path = _synchronize_cleanup_on_error( filepath, cache_key=cache_key, print_progress=not mute ) try: # cache_path is local so doesn't trigger any sync in load_to_memory access_memory = load_to_memory(cache_path, **kwargs) except Exception as e: # raise the exception if it comes from not having a correct loader # import error is also most probably not a problem with the cache # or if the original path is local if isinstance(e, (NotImplementedError, ImportError)) or isinstance( filepath, LocalPathClasses ): raise e logger.warning( f"The cache might be corrupted: {e}. Retrying to synchronize."
) # delete the existing cache if cache_path.is_dir(): shutil.rmtree(cache_path) else: cache_path.unlink(missing_ok=True) # download again and try to load into memory cache_path = _synchronize_cleanup_on_error( filepath, cache_key=cache_key, print_progress=not mute ) access_memory = load_to_memory(cache_path, **kwargs) # only call if load is successful track_run_input(self, is_run_input) return access_memory def cache( self, *, is_run_input: bool | None = None, mute: bool = False, **kwargs ) -> UPath: """Download cloud artifact to local cache. Follows syncing logic: only caches an artifact if it's outdated in the local cache. Returns a path to a locally cached on-disk object (say a `.jpg` file). Args: mute: Silence logging of caching progress. is_run_input: Whether to track this artifact as run input. Example: Sync the artifact from the cloud and return the local path to the cached file:: artifact.cache() #> PosixPath('/home/runner/work/Caches/lamindb/lamindata/pbmc68k.h5ad') """ if self._overwrite_versions and not self.is_latest: raise ValueError(OUTDATED_ARTIFACT_FILES_OVERWRITTEN_MSG) filepath, cache_key = _s().filepath_cache_key_from_artifact( self, using_key=settings._using_key ) if mute: kwargs["print_progress"] = False cache_path = _synchronize_cleanup_on_error( filepath, cache_key=cache_key, **kwargs ) # only call if sync is successful track_run_input(self, is_run_input) return cache_path def delete( self, permanent: bool | None = None, storage: bool | None = None, using_key: str | None = None, ) -> None: """Trash or permanently delete. A first call to `.delete()` puts an artifact into the trash (sets `branch_id` to `-1`). A second call permanently deletes the artifact. For an artifact that has multiple versions and for which `artifact.overwrite_versions` is `True` (the default behavior for folders), deleting a non-latest version will not delete the underlying storage unless `storage=True` is passed. Deleting the latest version will delete all versions. Args: permanent: Permanently delete the artifact (skip trash). storage: Indicate whether you want to delete the artifact in storage. Examples: Delete a single file artifact:: import lamindb as ln artifact = ln.Artifact.get(key="some.csv") artifact.delete() # delete a single file artifact Delete an old version of a folder-like artifact:: artifact = ln.Artifact.filter(key="folder.zarr", is_latest=False).first() artifact.delete() # delete an old version, the data will not be deleted Delete all versions of a folder-like artifact:: artifact = ln.Artifact.get(key="folder.zarr", is_latest=True) artifact.delete() # delete all versions, the data will be deleted or you will be prompted for deletion. """ super().delete(permanent=permanent, storage=storage, using_key=using_key) # TODO: consider renaming the transfer argument to sync def save( self, upload: bool | None = None, transfer: Literal["record", "annotations"] = "record", **kwargs, ) -> Artifact: """Save to database & storage. Args: upload: Trigger upload to cloud storage in instances with hybrid storage mode. transfer: In case the artifact was queried on a different instance, dictates the behavior of the sync. If "record", only the artifact record is synced to the current instance. If "annotations", also the annotations linked in the source instance are synced.
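A minimal sketch of the `transfer` argument, assuming the artifact was queried from another instance (the slug `account/source-instance` is a placeholder)::

    artifact = ln.Artifact.using("account/source-instance").get(key="data.parquet")
    artifact.save(transfer="annotations")  # also sync the linked annotations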
See Also: :doc:`sync` Example: Save a file-like artifact after creating it with the default constructor `Artifact()`:: import lamindb as ln artifact = ln.Artifact("./myfile.csv", key="myfile.csv").save() """ if ( not self._state.adding # skip on is_latest change # no need to check if saved because it is checked above and not self._field_changed("is_latest", check_is_saved=False) and not self.is_latest and self.branch_id != -1 # skip on soft deletion ): logger.warning("you are saving to a non-latest version of the artifact") access_token = kwargs.pop("access_token", None) current_instance_uid = setup_settings.instance.uid artifact_storage = self.storage artifact_storage_instance_uid = artifact_storage.instance_uid is_not_artifact_storage_managed_by_current_instance = ( artifact_storage_instance_uid != current_instance_uid ) if self._field_changed("key", check_is_saved=False): new_key = self.key if new_key is None: raise InvalidArgument("Cannot update an artifact key to None.") new_key_suffix = extract_suffix_from_path( PurePosixPath(new_key), arg_name="key" ) if new_key_suffix != self.suffix: raise InvalidArgument( f"The suffix '{new_key_suffix}' of the provided key is incorrect, it should be '{self.suffix}'." ) # Virtual key updates are metadata-only because physical storage keys are # uid-based. if self._key_is_virtual: self._original_values["key"] = new_key else: if self._state.adding: raise InvalidArgument( "Cannot update the key of an artifact before it is saved." ) if is_not_artifact_storage_managed_by_current_instance: raise InvalidArgument( "Cannot update a non-virtual key of an artifact" " in a storage location that is not managed by the current instance." ) old_key = self._original_values["key"] if old_key is None: raise InvalidArgument( "Cannot update a non-virtual artifact key from None." ) if not _handle_non_virtual_key_change_on_save( self, old_key=old_key, new_key=new_key ): return None if self._field_changed("suffix", check_is_saved=False): if self._state.adding: raise InvalidArgument( "Cannot update the suffix of an artifact before it is saved." ) if is_not_artifact_storage_managed_by_current_instance: raise InvalidArgument( "Cannot update the suffix of an artifact" " in a storage location that is not managed by the current instance." ) if not _handle_suffix_change_on_save(self): return None # when space is passed in init, storage is ignored, so space - storage consistency is enforced there if ( self._field_changed("space_id") # here we check for storages managed by any instance # not necessarily with managed credentials # we check further below if the artifact storage is managed by the current instance and artifact_storage_instance_uid is not None ): if is_not_artifact_storage_managed_by_current_instance: raise ValueError( "Cannot change the space of an artifact" " in a storage location that is not managed by the current instance." ) space = self.space storage_type = artifact_storage.type storages = Storage.connect(self._state.db).filter( space=space, instance_uid=current_instance_uid, type=storage_type ) n_storages = storages.count() if n_storages == 0: raise ValueError( f"No {storage_type} storage locations managed by the current instance found for the space '{space.name}'."
) elif n_storages > 1: storages = storages.order_by("id") roots_str = "\n".join( f"{i}: {storage.root}" for i, storage in enumerate(storages) ) choice = input( f"Select a storage location of type '{storage_type}' from the target space '{space.name}':" f" \n{roots_str}\n" "Enter the number or 'x' to cancel: " ) if choice == "x": logger.warning("saving was cancelled") return None storage = storages[int(choice)] else: storage = storages.one() if artifact_storage != storage: # try to transfer if both storages are writable / managed by an instance # replaces artifact.storage with the new storage if successful _move_artifact_to_storage(self, storage, access_token=access_token) else: logger.important("artifact is already in the target storage location") # Keep tracked values in sync after handling a space update so # repeated saves don't keep re-running this branch. self._original_values["space_id"] = self.space_id if transfer not in {"record", "annotations"}: raise ValueError( f"transfer should be either 'record' or 'annotations', not {transfer}" ) else: kwargs["transfer"] = transfer state_was_adding = self._state.adding print_progress = kwargs.pop("print_progress", True) store_kwargs = kwargs.pop( "store_kwargs", {} ) # kwargs for .upload_from in the end local_path = None if upload and setup_settings.instance.keep_artifacts_local: # switch local storage location to cloud local_path = self.path self.storage_id = setup_settings.instance.storage._id self._local_filepath = local_path # switch to virtual storage key upon upload # the local filepath is already cached at that point self._key_is_virtual = True # ensure that the artifact is uploaded self._to_store = True local_filepath = getattr(self, "_local_filepath", None) has_local_filepath = local_filepath is not None if has_local_filepath and not local_filepath.exists(): raise FileNotFoundError( f"Unable to save the artifact because the local path {local_filepath} does not exist." ) flag_complete = has_local_filepath and getattr(self, "_to_store", False) if flag_complete: if is_not_artifact_storage_managed_by_current_instance: raise ValueError( "Cannot save an artifact to a storage location that is not managed by the current instance." ) # _storage_ongoing indicates whether the storage saving / upload process is ongoing self._storage_ongoing = True # will be updated to False once complete self._save_skip_storage(**kwargs) using_key = None if "using" in kwargs: using_key = kwargs["using"] exception_upload = check_and_attempt_upload( self, using_key, access_token=access_token, print_progress=print_progress, **store_kwargs, ) if exception_upload is not None: # we do not want to raise file not found on cleanup if upload of a file failed # often it is ACID in the filesystem itself # for example, s3 won't have the failed file, so just skip the delete in this case raise_file_not_found_error = False self._delete_skip_storage() else: # this is the case when it is cleaned on .replace raise_file_not_found_error = True # this is triggered by an exception in check_and_attempt_upload or by replace. 
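# clearing deletes any storage object scheduled for removal, e.g. the old key that
# `.replace()` stored in `_clear_storagekey`; exceptions from upload and clearing are
# collected first and only re-raised after both steps have run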
exception_clear = check_and_attempt_clearing( self, raise_file_not_found_error=raise_file_not_found_error, using_key=using_key, ) if exception_upload is not None: raise exception_upload if exception_clear is not None: raise exception_clear # the saving / upload process has been successful if flag_complete: self._storage_ongoing = False # pass kwargs below because it can contain `using` or other things # affecting the connection super().save(**kwargs) # this is only for keep_artifacts_local if local_path is not None and not state_was_adding: # only move the local artifact to cache if it was not newly created local_path_cache = ln_setup.settings.cache_dir / local_path.name # don't use Path.rename here because of cross-device link error # https://laminlabs.slack.com/archives/C04A0RMA0SC/p1710259102686969 shutil.move( local_path, # type: ignore local_path_cache, ) logger.important(f"moved local artifact to cache: {local_path_cache}") # annotate with external features if hasattr(self, "_external_features"): external_features = self._external_features self.features.set_values(external_features) # annotate with internal features based on curator if hasattr(self, "_curator"): curator = self._curator del self._curator # just annotates this artifact curator.save_artifact() if hasattr(self, "_external_features"): del self._external_features if hasattr(self, "_local_filepath"): del self._local_filepath return self def _update_artifact_keys_with_suffix(artifact: Artifact, suffix: str): key = artifact.key real_key = artifact._real_key if key is not None: new_key = PurePosixPath(key).with_suffix(suffix).as_posix() artifact.key = new_key if real_key is not None: artifact._real_key = PurePosixPath(real_key).with_suffix(suffix).as_posix() def _confirm_artifact_move(source_path_str: str, target_path_str: str) -> bool: # ask for confirmation # TODO: add a way to disable confirmation response = input( f"You are about to move artifact from '{source_path_str}' to '{target_path_str}'.\n" "Continue? (y/n) " ) if response != "y": logger.warning("saving was cancelled") return False return True def _handle_non_virtual_key_change_on_save( artifact: Artifact, *, old_key: str, new_key: str ) -> bool: # _real_key should actually be None here because it goes with virtual key source_storage_key = ( artifact._real_key if artifact._real_key is not None else old_key ) source_path = artifact.storage.path / source_storage_key # key was updated, so artifact.path is the new path target_path_str = artifact.path.as_posix() source_path_str = source_path.as_posix() if not _confirm_artifact_move(source_path_str, target_path_str): return False _safe_move(source_path.fs, source_path_str, target_path_str) if artifact._real_key is not None: artifact._real_key = new_key # Keep tracked values in sync so repeated saves don't trigger another move. artifact._original_values["key"] = new_key # If key change already applied the suffix transition, skip suffix handling below. 
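# keeping `_original_values["suffix"]` in sync here is what lets the suffix-change
# check in save() see no pending change, so the suffix handler below is not run again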
artifact._original_values["suffix"] = artifact.suffix return True def _handle_suffix_change_on_save(artifact: Artifact) -> bool: suffix = artifact.suffix # depends on whether key is virtual or real key is present source_or_target_path = artifact.path source_path_str = source_or_target_path.with_suffix( artifact._original_values["suffix"] ).as_posix() target_path_str = source_or_target_path.with_suffix(suffix).as_posix() if not _confirm_artifact_move(source_path_str, target_path_str): return False # source_path and target_path are on the same filesystem _safe_move(source_or_target_path.fs, source_path_str, target_path_str) _update_artifact_keys_with_suffix(artifact, suffix) # Keep tracked values in sync so consecutive suffix updates on the same # in-memory instance trigger a move each time. artifact._original_values["suffix"] = suffix artifact._original_values["key"] = artifact.key return True def _sorted_sizes(fs: AbstractFileSystem, path: str) -> list[int]: objects = fs.find(path, detail=True) return sorted(info["size"] for info in objects.values()) def _rm_catch_error(fs: AbstractFileSystem, path: str) -> Exception | None: if fs.exists(path): try: fs.rm(path, recursive=True) except Exception as rm_exc: return rm_exc return None def _safe_move(fs: AbstractFileSystem, source: str, target: str): if fs.exists(target): raise FileExistsError( f"Cannot move artifact to '{target}' because it already exists." ) logger.important(f"moving artifact from '{source}' to '{target}'") try: fs.copy(source, target, recursive=True) except Exception as e: message = "Failed to copy artifact to target storage during transfer." cleanup_error = _rm_catch_error(fs, target) if cleanup_error is not None: message += f" Cleanup of copied target also failed: {cleanup_error}" raise RuntimeError(message) from e # check that the sizes of the files are the same if _sorted_sizes(fs, source) != _sorted_sizes(fs, target): message = "Move verification failed: copied artifact does not match source." cleanup_error = _rm_catch_error(fs, target) if cleanup_error is not None: message += " Cleanup of copied target also failed." 
raise RuntimeError(message) from cleanup_error try: fs.rm(source, recursive=True) except Exception as e: logger.error( f"copying to '{target}' succeeded but failed to remove source '{source}': {e}" ) def _move_artifact_to_storage( artifact: Artifact, storage: Storage, access_token: str | None = None ): storage_key = _s().auto_storage_key_from_artifact(artifact) source_path = artifact.path target_path = storage.path / storage_key if source_path == target_path: raise ValueError("Cannot move to the same path.") fs = fs_for_moving(source_path, target_path, access_token=access_token) source_path_str = str(source_path) target_path_str = str(target_path) _safe_move(fs, source_path_str, target_path_str) artifact.storage_id = storage.id # can't really just call .cache in .load because of double tracking def _synchronize_cleanup_on_error( filepath: UPath, cache_key: str | None = None, **kwargs ) -> UPath: try: print_progress = kwargs.pop("print_progress", True) cache_path = setup_settings.paths.cloud_to_local( filepath, cache_key=cache_key, print_progress=print_progress, **kwargs ) except Exception as e: if not isinstance(filepath, LocalPathClasses): cache_path = setup_settings.paths.cloud_to_local_no_update( filepath, cache_key=cache_key ) if cache_path.is_dir(): shutil.rmtree(cache_path) else: cache_path.unlink(missing_ok=True) raise e return cache_path def _delete_skip_storage(artifact, *args, **kwargs) -> None: super(SQLRecord, artifact).delete(*args, **kwargs) def _save_skip_storage(artifact, **kwargs) -> None: save_staged_schemas(artifact) super(Artifact, artifact).save(**kwargs) save_schema_links(artifact) class ArtifactJsonValue(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_jsonvalue") # we follow the lower() case convention rather than snake case for link models jsonvalue: JsonValue = ForeignKey(JsonValue, PROTECT, related_name="links_artifact") class Meta: app_label = "lamindb" unique_together = ("artifact", "jsonvalue") class ArtifactUser(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey("Artifact", CASCADE, related_name="links_user") user: User = ForeignKey(User, PROTECT, related_name="links_artifact") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, related_name="links_artifactuser", default=None ) class Meta: # can have the same label linked to the same artifact if the feature is # different app_label = "lamindb" unique_together = ("artifact", "user", "feature") class ArtifactRun(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey("Artifact", CASCADE, related_name="links_run") # consciously choosing CASCADE run: Run = ForeignKey(Run, CASCADE, related_name="links_artifact") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, related_name="links_artifactrun", default=None ) class Meta: # can have the same label linked to the same artifact if the feature is # different app_label = "lamindb" unique_together = ("artifact", "run", "feature") class ArtifactArtifact(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey("Artifact", CASCADE, related_name="links_artifact") # consciously choosing CASCADE value: Artifact = ForeignKey("Artifact", CASCADE, related_name="links_value") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, 
related_name="links_artifactartifact", default=None ) class Meta: # can have the same label linked to the same artifact if the feature is # different app_label = "lamindb" unique_together = ("artifact", "value", "feature") def track_run_input( record: ( Artifact | Iterable[Artifact] ), # can also be Collection | Iterable[Collection] is_run_input: bool | Run | None = None, run: Run | None = None, ) -> None: """Links a record as an input to a run. This function contains all validation logic to make decisions on whether a record qualifies as an input or not. """ if is_run_input is False: return None from ..core._context import context from ..core._functions import get_current_tracked_run from .collection import Collection if isinstance(is_run_input, Run): run = is_run_input is_run_input = True elif run is None: run = get_current_tracked_run() if run is None: run = context.run # consider that record is an iterable of Data record_iter: Iterable[Artifact] | Iterable[Collection] = ( [record] if isinstance(record, (Artifact, Collection)) else record ) input_records = [] if run is not None: assert not run._state.adding, "Save the run before tracking its inputs." # noqa: S101 def is_valid_input(record: Artifact | Collection): is_valid = False # if a record is not yet saved it has record._state.db = None # then it can't be an input # we silently ignore because what will happen is that # the record either gets saved and then is tracked as an output # or it won't get saved at all if record._state.db == "default": # things are OK if the record is on the default db is_valid = True else: # record is on another db # we have to save the record into the current db with # the run being attached to a transfer transform logger.info( f"completing transfer to track {record.__class__.__name__}('{record.uid}') as input" ) record.save() is_valid = True # avoid cycles: record can't be both input and output if record.run_id == run.id: logger.debug( f"not tracking {record} as input to run {run} because created by same run" ) is_valid = False if run.id == getattr(record, "_subsequent_run_id", None): logger.debug( f"not tracking {record} as input to run {run} because re-created in same run" ) is_valid = False return is_valid input_records = [record for record in record_iter if is_valid_input(record)] input_records_ids = [record.id for record in input_records] if input_records: record_class_name = input_records[0].__class__.__name__.lower() # let us first look at the case in which the user does not # provide a boolean value for `is_run_input` # hence, we need to determine whether we actually want to # track a run or not track = False is_run_input = settings.track_run_inputs if is_run_input is None else is_run_input if is_run_input: if run is None: isettings = setup_settings.instance if not (isettings._is_clone or isettings.is_read_only_connection): logger.warning(WARNING_NO_INPUT) elif input_records: logger.debug( f"adding {record_class_name} ids {input_records_ids} as inputs for run {run.id}" ) track = True else: track = is_run_input if not track or not input_records: return None if run is None: raise ValueError("No run context set. 
Call `ln.track()`.") if record_class_name == "artifact": IsLink = run.input_artifacts.through links = [ IsLink(run_id=run.id, artifact_id=record_id) for record_id in input_records_ids ] else: IsLink = run.input_collections.through links = [ IsLink(run_id=run.id, collection_id=record_id) for record_id in input_records_ids ] try: IsLink.objects.bulk_create(links, ignore_conflicts=True) except ProgrammingError as e: if "new row violates row-level security policy" in str(e): instance = setup_settings.instance available_spaces = instance.available_spaces if available_spaces is None: raise NoWriteAccess( f"You’re not allowed to write to the instance {instance.slug}.\n" "Please contact administrators of the instance if you need write access." ) from None write_access_spaces = available_spaces["admin"] + available_spaces["write"] no_write_access_spaces = { record_space for record in input_records if (record_space := record.space) not in write_access_spaces } if (run_space := run.space) not in write_access_spaces: no_write_access_spaces.add(run_space) if not no_write_access_spaces: # if there are no unavailable spaces, then this should be due to locking locked_records = [ record for record in input_records if getattr(record, "is_locked", False) ] if run.is_locked: locked_records.append(run) # if no unavailable spaces and no locked records, just raise the original error if not locked_records: raise e no_write_msg = ( "It is not allowed to modify locked records: " + ", ".join( r.__class__.__name__ + f"(uid={r.uid})" for r in locked_records ) + "." ) raise NoWriteAccess(no_write_msg) from None if len(no_write_access_spaces) > 1: name_msg = ", ".join( f"'{space.name}'" for space in no_write_access_spaces ) space_msg = "spaces" else: name_msg = f"'{no_write_access_spaces.pop().name}'" space_msg = "space" raise NoWriteAccess( f"You’re not allowed to write to the {space_msg} {name_msg}.\n" f"Please contact administrators of the {space_msg} if you need write access." ) from None else: raise e # privates currently dealt with separately # mypy: ignore-errors Artifact._delete_skip_storage = _delete_skip_storage Artifact._save_skip_storage = _save_skip_storage Artifact.view_lineage = view_lineage # PostgreSQL migration helper for _save_completed to _aux["storage_completed"] def migrate_save_completed_to_aux_postgres(schema_editor) -> None: """Migrate _save_completed field to _aux['storage_completed'] using PostgreSQL raw SQL. This migrates _save_completed=False into _aux['storage_completed']=false. _save_completed=True results in no change to _aux (empty JSON is the default). 
""" schema_editor.execute(""" UPDATE lamindb_artifact SET _aux = CASE WHEN _save_completed = FALSE THEN CASE WHEN _aux IS NULL THEN jsonb_build_object('storage_completed', false) ELSE _aux || jsonb_build_object('storage_completed', false) END ELSE _aux END, _save_completed = NULL WHERE _save_completed IS NOT NULL """) ================================================ FILE: lamindb/models/artifact_set.py ================================================ from __future__ import annotations from collections.abc import Iterable, Iterator from typing import TYPE_CHECKING, Literal from django.db.models import Case, Q, TextField, Value, When from django.db.models.functions import Concat from lamin_utils import logger from lamindb_setup.core._docs import doc_args from upath import UPath from .artifact import Artifact, track_run_input from .collection import Collection, _load_concat_artifacts if TYPE_CHECKING: from anndata import AnnData from lamindb_setup.types import AnyPathStr from pandas import DataFrame from polars import LazyFrame as PolarsLazyFrame from pyarrow.dataset import Dataset as PyArrowDataset from ..core._mapped_collection import MappedCollection UNORDERED_WARNING = ( "this query set is unordered, consider using `.order_by()` first " "to avoid opening the artifacts in an arbitrary order" ) # maybe make this abstract class ArtifactSet(Iterable): """Abstract class representing sets of artifacts returned by queries. This class automatically extends :class:`~lamindb.models.BasicQuerySet` and :class:`~lamindb.models.QuerySet` when the base model is :class:`~lamindb.Artifact`. Examples: >>> artifacts = ln.Artifact.filter(otype="AnnData") >>> artifacts # an instance of ArtifactQuerySet inheriting from ArtifactSet """ @doc_args(Collection.load.__doc__) def load( self, join: Literal["inner", "outer"] = "outer", is_run_input: bool | None = None, **kwargs, ) -> DataFrame | AnnData: """{}""" # noqa: D415 if not self.ordered: # type: ignore logger.warning(UNORDERED_WARNING) artifacts: list[Artifact] = list(self) concat_object = _load_concat_artifacts(artifacts, join, **kwargs) # track only if successful track_run_input(artifacts, is_run_input) return concat_object @doc_args(Collection.open.__doc__) def open( self, engine: Literal["pyarrow", "polars"] = "pyarrow", is_run_input: bool | None = None, **kwargs, ) -> PyArrowDataset | Iterator[PolarsLazyFrame]: """{}""" # noqa: D415 from ..core.storage._backed_access import _open_dataframe if not self.ordered: # type: ignore logger.warning(UNORDERED_WARNING) artifacts: list[Artifact] = list(self) paths: list[UPath] = [artifact.path for artifact in artifacts] dataframe = _open_dataframe(paths, engine=engine, **kwargs) # track only if successful track_run_input(artifacts, is_run_input) return dataframe @doc_args(Collection.mapped.__doc__) def mapped( self, layers_keys: str | list[str] | None = None, obs_keys: str | list[str] | None = None, obsm_keys: str | list[str] | None = None, obs_filter: dict[str, str | list[str]] | None = None, join: Literal["inner", "outer"] | None = "inner", encode_labels: bool | list[str] = True, unknown_label: str | dict[str, str] | None = None, cache_categories: bool = True, parallel: bool = False, dtype: str | None = None, stream: bool = False, is_run_input: bool | None = None, ) -> MappedCollection: """{}""" # noqa: D415 from ..core._mapped_collection import MappedCollection if not self.ordered: # type: ignore logger.warning(UNORDERED_WARNING) artifacts: list[Artifact] = [] paths: list[UPath] = [] for artifact in self: if 
".h5ad" not in artifact.suffix and ".zarr" not in artifact.suffix: logger.warning(f"ignoring artifact with suffix {artifact.suffix}") continue elif not stream: paths.append(artifact.cache()) else: paths.append(artifact.path) artifacts.append(artifact) ds = MappedCollection( paths, layers_keys, obs_keys, obsm_keys, obs_filter, join, encode_labels, unknown_label, cache_categories, parallel, dtype, ) # track only if successful track_run_input(artifacts, is_run_input) return ds def artifacts_from_path(artifacts: ArtifactSet, path: AnyPathStr) -> ArtifactSet: """Returns artifacts in the query set that are registered for the provided path.""" from lamindb.models import BasicQuerySet, QuerySet # not QuerySet but only BasicQuerySet assert isinstance(artifacts, BasicQuerySet) and not isinstance(artifacts, QuerySet) # noqa: S101 upath = UPath(path) path_str = upath.as_posix() stem = upath.stem stem_len = len(stem) if stem_len == 16: qs = artifacts.filter( Q(_key_is_virtual=True) | Q(key__isnull=True), _real_key__isnull=True, uid__startswith=stem, ) elif stem_len == 20: qs = artifacts.filter( Q(_key_is_virtual=True) | Q(key__isnull=True), _real_key__isnull=True, uid=stem, ) else: qs = None if qs: # an empty query set evaluates to False return qs qs = ( artifacts.filter(Q(_key_is_virtual=False) | Q(_real_key__isnull=False)) .alias( db_path=Case( When( _real_key__isnull=False, then=Concat( "storage__root", Value("/"), "_real_key", output_field=TextField(), ), ), default=Concat( "storage__root", Value("/"), "key", output_field=TextField() ), output_field=TextField(), ) ) .filter(db_path=path_str) ) return qs ================================================ FILE: lamindb/models/block.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Any, Literal, get_args, overload from django.db import models from django.db.models import ( CASCADE, PROTECT, CharField, DateTimeField, ForeignKey, JSONField, Q, TextField, ) from lamin_utils import logger from lamindb_setup.core.hashing import hash_string from ..base.types import RegistryId from ..base.uids import base62_16 from ._is_versioned import create_uid, process_revises from .artifact import Artifact from .collection import Collection from .feature import Feature from .project import Project from .record import Record from .run import Run, User, current_user_id from .schema import Schema from .sqlrecord import ( BaseSQLRecord, Branch, IsVersioned, Space, SQLRecord, init_self_from_db, update_attributes, ) from .transform import Transform if TYPE_CHECKING: from datetime import datetime from .query_manager import RelatedManager _VERSIONED_ATTACHED_KINDS = ("readme",) # only readme is versioned; comment is not _VALID_BLOCK_KINDS: tuple[str, ...] = ("readme", "comment") _BLOCK_ALLOWED_NON_REGISTRY_KEYS: tuple[str, ...] = ("README.md",) def _init_versioned_attached_block( self: BaseBlock, fk_field_name: str, *args: Any, allowed_extra: tuple[str, ...] 
= (), **kwargs: Any, ) -> None: cls = type(self) if len(args) == len(self._meta.concrete_fields): super(cls, self).__init__(*args, **kwargs) return None if args: raise ValueError( f"Please only use keyword arguments to construct a {cls.__name__}" ) fk_value = kwargs.pop(fk_field_name, None) content = kwargs.pop("content", None) kind = kwargs.pop("kind", None) version_tag = kwargs.pop("version_tag", kwargs.pop("version", None)) revises = kwargs.pop("revises", None) using = kwargs.pop("using", None) uid = kwargs.pop("uid", None) if "uid" in kwargs else None default_allowed_extra = ("branch", "branch_id", "created_on", "created_on_id") all_allowed_extra = default_allowed_extra + allowed_extra extra_kwargs = {k: kwargs.pop(k) for k in all_allowed_extra if k in kwargs} allowed = { fk_field_name, "content", "kind", "version", "version_tag", "revises", "using", "uid", *all_allowed_extra, } if kwargs: raise ValueError( f"Only {', '.join(sorted(allowed))} can be passed, but you passed: {kwargs}" ) if fk_value is None: raise ValueError(f"{fk_field_name} is required for {cls.__name__}") if kind is None: raise ValueError( f"kind is required for {cls.__name__}; use 'readme' or 'comment'" ) if kind not in _VALID_BLOCK_KINDS: raise ValueError(f"kind must be 'readme' or 'comment', got {kind!r}") if kind == "comment": if revises is not None: raise ValueError( "revises is not allowed for kind='comment'; comments are not versioned" ) new_uid, _ = create_uid( revises=None, version_tag=version_tag, n_full_id=cls._len_full_uid, ) block_hash = hash_string(content) if content else None super(cls, self).__init__( uid=new_uid, content=content or "", hash=block_hash, kind=kind, version_tag=version_tag, revises=None, **{fk_field_name: fk_value}, **extra_kwargs, ) return None # kind == "readme" (versioned) if revises is None and fk_value is not None: candidate_for_revises = ( cls.objects.using(using) .filter( **{fk_field_name: fk_value}, kind=kind, is_latest=True, ) .order_by("-created_at") .first() ) if candidate_for_revises is not None: revises = candidate_for_revises content_blank = getattr(revises, "content", None) in (None, "") if content_blank: logger.important( "no content was yet saved, returning existing " f"block with same {fk_field_name} and kind" ) uid = revises.uid if revises is not None and uid is not None and uid == revises.uid: init_self_from_db(self, revises) update_attributes(self, {}) return None new_uid, revises = create_uid( revises=revises, version_tag=version_tag, n_full_id=cls._len_full_uid, ) if uid is None: uid = new_uid block_hash = hash_string(content) if content else None super(cls, self).__init__( uid=uid, content=content or "", hash=block_hash, kind=kind, version_tag=version_tag, revises=revises, **{fk_field_name: fk_value}, **extra_kwargs, ) class BaseBlock(IsVersioned): class Meta: abstract = True _len_full_uid: int = 20 _len_stem_uid: int = 16 id = models.BigAutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, db_index=True, max_length=_len_full_uid, default=base62_16, ) """Universal id.""" content: str = TextField() """Content of the block.""" hash: str = CharField(max_length=22, db_index=True, null=True) """Content hash of the block.""" kind: str = CharField( max_length=22, db_index=True, default="readme", db_default="readme" ) """The kind of block.""" created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" created_by: User = 
ForeignKey( "lamindb.User", PROTECT, default=current_user_id, related_name="+" ) """Creator of block.""" _status_code: int = models.SmallIntegerField(default=0, db_default=0, db_index=True) """Status code.""" _aux: dict[str, Any] | None = JSONField(default=None, db_default=None, null=True) """Auxiliary field for dictionary-like metadata.""" class Block(BaseBlock, SQLRecord): """An experimental markdown block for anything: issues, standalone markdown pages, comments, etc. The `Block` model is experimental and may change in the future. """ class Meta: app_label = "lamindb" # same key as in transform/artifact/collection key: str | None = CharField(max_length=1024, db_index=True, null=True) """The key for which we want to create a block.""" anchor: Block | None = ForeignKey( "Block", PROTECT, related_name="children", null=True ) """The anchor of this block. For a comment, could be the issue on which the comment is attached. For a sub-post, could be the parent post. """ projects: RelatedManager[Project] """Projects that annotate this block.""" anchors: RelatedManager[Block] """This block anchors these blocks.""" @overload def __init__( self, key: str | None = None, content: str | None = None, kind: Literal["readme"] = ..., version: str | None = None, revises: Block | None = None, anchor: Block | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None if args: raise ValueError("Please only use keyword arguments to construct a Block") key = kwargs.pop("key", None) content = kwargs.pop("content", None) revises = kwargs.pop("revises", None) version_tag = kwargs.pop("version_tag", kwargs.pop("version", None)) kind = kwargs.pop("kind", None) anchor = kwargs.pop("anchor", None) using = kwargs.pop("using", None) uid = kwargs.pop("uid", None) if "uid" in kwargs else None branch = kwargs.pop("branch", None) branch_id = kwargs.pop("branch_id", 1) space = kwargs.pop("space", None) space_id = kwargs.pop("space_id", 1) if kwargs: raise ValueError( "Only key, content, kind, version, revises, anchor " f"can be passed, but you passed: {kwargs}" ) if kind != "readme": raise ValueError("Only kind = 'readme' is supported for block.") _registry_ids = get_args(RegistryId) allowed_keys = set(_registry_ids).union(_BLOCK_ALLOWED_NON_REGISTRY_KEYS) if key is not None and key not in allowed_keys: raise ValueError( "key must be one of RegistryId or " f"{', '.join(_BLOCK_ALLOWED_NON_REGISTRY_KEYS)}: " f"{', '.join(_registry_ids)}" ) if revises is not None and not isinstance(revises, Block): raise TypeError("`revises` has to be of type `Block`") if revises is None: if uid is not None: revises = ( Block.objects.using(using) .filter( uid__startswith=uid[:-4], is_latest=True, ) .order_by("-created_at") .first() ) elif key is not None: candidate_for_revises = ( Block.objects.using(using) .filter( ~Q(branch_id=-1), key=key, is_latest=True, ) .order_by("-created_at") .first() ) if candidate_for_revises is not None: revises = candidate_for_revises content_blank = getattr(candidate_for_revises, "content", None) in ( None, "", ) if content_blank: logger.important( "no content was yet saved, returning existing " "block with same key" ) uid = revises.uid if revises is not None and uid is not None and uid == revises.uid: if revises.key != key: logger.warning("ignoring inconsistent key") init_self_from_db(self, revises) update_attributes(self, {}) return None if revises is not None and key is 
not None and revises.key != key: logger.important(f"renaming block {revises.key} to {key}") new_uid, version_tag, key, _, revises = process_revises( revises, version_tag, key, None, Block ) if uid is None: uid = new_uid block_hash = None if content is not None: block_hash = hash_string(content) block_candidate = Block.objects.filter( ~Q(branch_id=-1), hash=block_hash, is_latest=True, ).first() if block_candidate is not None: init_self_from_db(self, block_candidate) update_attributes(self, {}) if key is not None and block_candidate.key != key: logger.warning( f"key {self.key} on existing block differs from " f"passed key {key}, keeping original key" ) return None super().__init__( uid=uid, key=key, content=content or "", kind=kind, version_tag=version_tag, hash=block_hash, revises=revises, anchor=anchor, branch=branch, branch_id=branch_id, space=space, space_id=space_id, ) class HasBranch(models.Model): class Meta: abstract = True branch: Branch = ForeignKey( Branch, PROTECT, default=1, db_default=1, related_name="+", ) """The current branch of the object - changes e.g. on merge events.""" created_on: Branch = ForeignKey( Branch, PROTECT, default=1, db_default=1, related_name="+", ) """The branch on which this object was created - never changes.""" class RecordBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a record.""" class Meta: app_label = "lamindb" record: Record = ForeignKey(Record, CASCADE, related_name="ablocks") """The record to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "record", *args, **kwargs) class ArtifactBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to an artifact.""" class Meta: app_label = "lamindb" artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="ablocks") """The artifact to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "artifact", *args, **kwargs) class TransformBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a transform.""" class Meta: app_label = "lamindb" transform: Transform = ForeignKey( Transform, CASCADE, related_name="ablocks", null=True ) """The transform to which the block is attached.""" line_number: int | None = models.IntegerField(null=True) """The line number in the source code to which the block belongs.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block( self, "transform", *args, allowed_extra=("line_number",), **kwargs ) class RunBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a run.""" class Meta: app_label = "lamindb" run: Run = ForeignKey(Run, CASCADE, related_name="ablocks") """The run to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "run", *args, **kwargs) class CollectionBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a collection.""" class Meta: app_label = "lamindb" collection: Collection = ForeignKey( Collection, CASCADE, related_name="ablocks", null=True ) """The collection to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "collection", *args, **kwargs) class SchemaBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a schema.""" class Meta: app_label = "lamindb" schema: Schema 
= ForeignKey(Schema, CASCADE, related_name="ablocks") """The schema to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "schema", *args, **kwargs) class FeatureBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a feature.""" class Meta: app_label = "lamindb" feature: Feature = ForeignKey(Feature, CASCADE, related_name="ablocks") """The feature to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "feature", *args, **kwargs) class ProjectBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a project.""" class Meta: app_label = "lamindb" project: Project = ForeignKey(Project, CASCADE, related_name="ablocks") """The project to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "project", *args, **kwargs) class SpaceBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a space.""" class Meta: app_label = "lamindb" space: Space = ForeignKey(Space, CASCADE, related_name="ablocks") """The space to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "space", *args, **kwargs) class ULabelBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a ulabel.""" class Meta: app_label = "lamindb" ulabel = ForeignKey("ULabel", CASCADE, related_name="ablocks") """The ulabel to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "ulabel", *args, **kwargs) class BranchBlock(BaseBlock, BaseSQLRecord): """An unstructured notes block that can be attached to a branch.""" class Meta: app_label = "lamindb" branch: Branch = ForeignKey(Branch, CASCADE, related_name="ablocks") """The branch to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "branch", *args, **kwargs) ================================================ FILE: lamindb/models/can_curate.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Iterable, Literal, Union import numpy as np from django.core.exceptions import FieldDoesNotExist from django.db.models import Manager, QuerySet from lamin_utils import colors, logger from lamindb.base.utils import strict_classmethod from ..errors import ValidationError from ._from_values import ( _format_values, _from_values, get_organism_record_from_field, ) from .sqlrecord import SQLRecord, get_name_field if TYPE_CHECKING: from lamin_utils._inspect import InspectResult from pandas import DataFrame from lamindb.base.types import ListLike, StrField from .query_set import SQLRecordList def _check_if_record_in_db(record: str | SQLRecord | None, using_key: str | None): """Check if the record is from the using_key DB.""" if isinstance(record, SQLRecord): if using_key is not None and using_key != "default": if record._state.db != using_key: raise ValueError( f"record must be a {record.__class__.__get_name_with_module__()} record from instance '{using_key}'!" 
) def _concat_lists(values: ListLike | str) -> list[str]: """Concatenate a list of lists of strings into a single list.""" import pandas as pd if isinstance(values, str): values = [values] if isinstance(values, (list, pd.Series)) and len(values) > 0: first_item = values[0] if isinstance(values, list) else values.iloc[0] if isinstance(first_item, list): if isinstance(values, pd.Series): values = values.tolist() values = [ v for sublist in values if isinstance(sublist, list) for v in sublist ] return values def _inspect( cls, values: ListLike, field: StrField | None = None, *, mute: bool = False, organism: str | SQLRecord | None = None, source: SQLRecord | None = None, from_source: bool = True, strict_source: bool = False, ) -> DataFrame | dict[str, list[str]]: """{}""" # noqa: D415 from lamin_utils._inspect import inspect values = _concat_lists(values) field_str = get_name_field(cls, field=field) queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.filter().all() registry = queryset.model model_name = registry._meta.model.__name__ if isinstance(source, SQLRecord): _check_if_record_in_db(source, queryset.db) # if strict_source mode, restrict the query to the passed ontology source # otherwise, inspect across records present in the DB from all ontology sources and no-source if strict_source: queryset = queryset.filter(source=source) organism_record = get_organism_record_from_field( getattr(registry, field_str), organism, values, queryset.db ) _check_if_record_in_db(organism_record, queryset.db) # do not inspect synonyms if the field is not name field standardize = True if hasattr(registry, "_name_field") and field_str != registry._name_field: standardize = False # inspect in the DB result_db = inspect( df=_filter_queryset_with_organism(queryset=queryset, organism=organism_record), identifiers=values, field=field_str, standardize=standardize, mute=mute, ) nonval = set(result_db.non_validated).difference(result_db.synonyms_mapper.keys()) if from_source and len(nonval) > 0 and hasattr(registry, "source_id"): try: public_result = registry.public( organism=organism_record, source=source ).inspect( values=nonval, field=field_str, mute=True, standardize=standardize, ) public_validated = public_result.validated public_mapper = public_result.synonyms_mapper hint = False if len(public_validated) > 0 and not mute: print_values = _format_values(public_validated) s = "" if len(public_validated) == 1 else "s" labels = colors.yellow(f"{len(public_validated)} {model_name} term{s}") logger.print( f" detected {labels} in public source for" f" {colors.italic(field_str)}: {colors.yellow(print_values)}" ) hint = True if len(public_mapper) > 0 and not mute: print_values = _format_values(list(public_mapper.keys())) s = "" if len(public_mapper) == 1 else "s" labels = colors.yellow(f"{len(public_mapper)} {model_name} term{s}") logger.print( f" detected {labels} in public source as {colors.italic(f'synonym{s}')}:" f" {colors.yellow(print_values)}" ) hint = True if hint: logger.print( f"→ add records from public source to your {model_name} registry via" f" {colors.italic('.from_values()')}" ) nonval = [i for i in public_result.non_validated if i not in public_mapper] # type: ignore # no public source is found except ValueError: logger.warning("no public source found, skipping source validation") if len(nonval) > 0 and not mute: print_values = _format_values(list(nonval)) s = "" if len(nonval) == 1 else "s" labels = colors.red(f"{len(nonval)} term{s}") logger.print(f" couldn't validate {labels}: 
{colors.red(print_values)}") logger.print( f"→ if you are sure, create new record{s} via" f" {colors.italic(f'{registry.__name__}()')} and save to your registry" ) return result_db def _validate( cls, values: ListLike, field: StrField | None = None, *, mute: bool = False, organism: str | SQLRecord | None = None, source: SQLRecord | None = None, strict_source: bool = False, ) -> np.ndarray: """{}""" # noqa: D415 import pandas as pd from lamin_utils._inspect import validate return_str = True if isinstance(values, str) else False values = _concat_lists(values) field_str = get_name_field(cls, field=field) queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.filter().all() registry = queryset.model if isinstance(source, SQLRecord): _check_if_record_in_db(source, queryset.db) if strict_source: queryset = queryset.filter(source=source) organism_record = get_organism_record_from_field( getattr(registry, field_str), organism, values, queryset.db ) _check_if_record_in_db(organism_record, queryset.db) field_values = pd.Series( _filter_queryset_with_organism( queryset=queryset, organism=organism_record, values_list_field=field_str, ), dtype="object", ) if field_values.empty: if not mute: msg = f"Your {queryset.model.__name__} registry is empty, consider populating it first!" if hasattr(queryset.model, "source_id"): msg += "\n → use `.import_source()` to import records from a source, e.g. a public ontology" logger.warning(msg) return np.array([False] * len(values)) result = validate( identifiers=values, field_values=field_values, case_sensitive=True, mute=mute, field=field_str, ) if return_str and len(result) == 1: return result[0] else: return result def _standardize( cls, values: ListLike, field: StrField | None = None, *, return_field: str = None, return_mapper: bool = False, case_sensitive: bool = False, mute: bool = False, from_source: bool = True, keep: Literal["first", "last", False] = "first", synonyms_field: str = "synonyms", organism: str | SQLRecord | None = None, source: SQLRecord | None = None, strict_source: bool = False, ) -> list[str] | dict[str, str]: """{}""" # noqa: D415 import pandas as pd from lamin_utils._standardize import standardize as map_synonyms return_str = True if isinstance(values, str) else False values = _concat_lists(values) field_str = get_name_field(cls, field=field) return_field_str = get_name_field( cls, field=field if return_field is None else return_field ) queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.filter().all() registry = queryset.model if isinstance(source, SQLRecord): _check_if_record_in_db(source, queryset.db) if strict_source: queryset = queryset.filter(source=source) organism_record = get_organism_record_from_field( getattr(registry, field_str), organism, values, queryset.db ) _check_if_record_in_db(organism_record, queryset.db) # only perform synonym mapping if field is the name field if hasattr(registry, "_name_field") and field_str != registry._name_field: synonyms_field = None try: registry._meta.get_field(synonyms_field) fields = { field_name for field_name in [field_str, return_field_str, synonyms_field] if field_name is not None } df = _filter_queryset_with_organism( queryset=queryset, organism=organism_record, values_list_fields=list(fields), ) except FieldDoesNotExist: df = pd.DataFrame() _kwargs = { "field": field_str, "return_field": return_field_str, "case_sensitive": case_sensitive, "keep": keep, "synonyms_field": synonyms_field, } # standardized names from the DB std_names_db = map_synonyms( df=df, 
identifiers=values, return_mapper=return_mapper, mute=mute, **_kwargs, ) def _return(result: list, mapper: dict): if return_mapper: return mapper else: if return_str and len(result) == 1: return result[0] return result # map synonyms in public source if hasattr(registry, "source_id") and from_source: mapper = {} if return_mapper: mapper = std_names_db std_names_db = map_synonyms( df=df, identifiers=values, return_mapper=False, mute=True, **_kwargs ) val_res = registry.validate( std_names_db, field=field, mute=True, organism=organism_record ) if all(val_res): return _return(result=std_names_db, mapper=mapper) nonval = np.array(std_names_db)[~val_res] std_names_bt_mapper = registry.public( organism=organism_record, source=source ).standardize(nonval, return_mapper=True, mute=True, **_kwargs) if len(std_names_bt_mapper) > 0 and not mute: s = "" if len(std_names_bt_mapper) == 1 else "s" field_print = "synonym" if field_str == return_field_str else field_str reduced_mapped_keys_str = f"{list(std_names_bt_mapper.keys())[:10] + ['...'] if len(std_names_bt_mapper) > 10 else list(std_names_bt_mapper.keys())}" truncated_note = ( " (output truncated)" if len(std_names_bt_mapper) > 10 else "" ) warn_msg = ( f"found {len(std_names_bt_mapper)} {field_print}{s} in public source{truncated_note}:" f" {reduced_mapped_keys_str}\n" f" please add corresponding {registry._meta.model.__name__} records via{truncated_note}:" f" `.from_values({reduced_mapped_keys_str})`" ) logger.warning(warn_msg) mapper.update(std_names_bt_mapper) if hasattr(std_names_db, "dtype") and isinstance( std_names_db.dtype, pd.CategoricalDtype ): result = std_names_db.cat.rename_categories(std_names_bt_mapper).tolist() else: result = pd.Series(std_names_db).replace(std_names_bt_mapper).tolist() return _return(result=result, mapper=mapper) else: return _return(result=std_names_db, mapper=std_names_db) def _add_or_remove_synonyms( synonym: str | ListLike, record: CanCurate, action: Literal["add", "remove"], force: bool = False, save: bool | None = None, ): """Add or remove synonyms.""" def check_synonyms_in_all_records(synonyms: set[str], record: CanCurate): """Errors if input synonym is associated with other records in the DB.""" import pandas as pd from IPython.display import display syns_all = ( record.__class__.filter().exclude(synonyms="").exclude(synonyms=None) # type: ignore ) if len(syns_all) == 0: return df = pd.DataFrame(syns_all.values()) df["synonyms"] = df["synonyms"].str.split("|") df = df.explode("synonyms") matches_df = df[(df["synonyms"].isin(synonyms)) & (df["id"] != record.id)] # type: ignore if matches_df.shape[0] > 0: records_df = pd.DataFrame(syns_all.filter(id__in=matches_df["id"]).values()) logger.error( f"input synonyms {matches_df['synonyms'].unique()} already associated" " with the following records:\n" ) display(records_df) raise ValidationError( f"you are trying to assign a synonym to record: {record}\n" " → consider removing the synonym from existing records or using a different synonym." 
) # passed synonyms # nothing happens when passing an empty string or list if isinstance(synonym, str): if len(synonym) == 0: return syn_new_set = {synonym} else: if synonym == [""]: return syn_new_set = set(synonym) # nothing happens when passing an empty string or list if len(syn_new_set) == 0: return # because we use | as the separator if any("|" in i for i in syn_new_set): raise ValidationError("a synonym can't contain '|'!") # existing synonyms syns_exist = record.synonyms # type: ignore if syns_exist is None or len(syns_exist) == 0: syns_exist_set = set() else: syns_exist_set = set(syns_exist.split("|")) if action == "add": if not force: check_synonyms_in_all_records(syn_new_set, record) syns_exist_set.update(syn_new_set) elif action == "remove": syns_exist_set = syns_exist_set.difference(syn_new_set) if len(syns_exist_set) == 0: syns_str = None else: syns_str = "|".join(syns_exist_set) record.synonyms = syns_str # type: ignore if save is None: # if record is already in DB, save the changes to DB save = not record._state.adding # type: ignore if save: record.save() # type: ignore def _check_synonyms_field_exist(record: CanCurate): """Check if synonyms field exists.""" if not hasattr(record, "synonyms"): raise NotImplementedError( f"No synonyms field found in table {record.__class__.__name__}!" ) from None def _filter_queryset_with_organism( queryset: QuerySet, organism: SQLRecord | None = None, values_list_field: str | None = None, values_list_fields: list[str] | None = None, ): """Filter a queryset based on organism.""" import pandas as pd if organism is not None: queryset = queryset.filter(organism=organism) # values_list_field/s for better performance if values_list_field is None: if values_list_fields: return pd.DataFrame.from_records( queryset.values_list(*values_list_fields), columns=values_list_fields ) return pd.DataFrame.from_records(queryset.values()) else: return queryset.values_list(values_list_field, flat=True) class CanCurate: """Base class providing :class:`~lamindb.models.SQLRecord`-based validation.""" @strict_classmethod def inspect( cls, values: ListLike, field: StrField | None = None, *, mute: bool = False, organism: Union[str, SQLRecord, None] = None, source: SQLRecord | None = None, from_source: bool = True, strict_source: bool = False, ) -> InspectResult: """Inspect if values are mappable to a field. Being mappable means that an exact match exists. Args: values: Values that will be checked against the field. field: The field of values. Examples are `'ontology_id'` to map against the source ID or `'name'` to map against the ontologies field names. mute: Whether to mute logging. organism: An Organism name or record. source: A `bionty.Source` record that specifies the version to inspect against. strict_source: Determines the validation behavior against records in the registry. - If `False`, validation will include all records in the registry, ignoring the specified source. - If `True`, validation will only include records in the registry that are linked to the specified source. Note: this parameter won't affect validation against public sources. 
See Also: :meth:`~lamindb.models.CanCurate.validate` Example:: import bionty as bt # save some gene records bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol", organism="human").save() # inspect gene symbols gene_symbols = ["A1CF", "A1BG", "FANCD1", "FANCD20"] result = bt.Gene.inspect(gene_symbols, field=bt.Gene.symbol, organism="human") assert result.validated == ["A1CF", "A1BG"] assert result.non_validated == ["FANCD1", "FANCD20"] """ return _inspect( cls=cls, values=values, field=field, mute=mute, strict_source=strict_source, organism=organism, source=source, from_source=from_source, ) @strict_classmethod def validate( cls, values: ListLike, field: StrField | None = None, *, mute: bool = False, organism: Union[str, SQLRecord, None] = None, source: SQLRecord | None = None, strict_source: bool = False, ) -> np.ndarray: """Validate values against existing values of a string field. Note this is strict validation: only exact matches are asserted. Args: values: Values that will be validated against the field. field: The field of values. Examples are `'ontology_id'` to map against the source ID or `'name'` to map against the ontologies field names. mute: Whether to mute logging. organism: An Organism name or record. source: A `bionty.Source` record that specifies the version to validate against. strict_source: Determines the validation behavior against records in the registry. - If `False`, validation will include all records in the registry, ignoring the specified source. - If `True`, validation will only include records in the registry that are linked to the specified source. Note: this parameter won't affect validation against public sources. Returns: A vector of booleans indicating if an element is validated. See Also: :meth:`~lamindb.models.CanCurate.inspect` Example:: import bionty as bt bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol", organism="human").save() gene_symbols = ["A1CF", "A1BG", "FANCD1", "FANCD20"] bt.Gene.validate(gene_symbols, field=bt.Gene.symbol, organism="human") #> array([ True, True, False, False]) """ return _validate( cls=cls, values=values, field=field, mute=mute, strict_source=strict_source, organism=organism, source=source, ) @strict_classmethod def from_values( cls, values: ListLike, field: StrField | None = None, create: bool = False, organism: Union[SQLRecord, str, None] = None, source: SQLRecord | None = None, standardize: bool = True, from_source: bool = True, mute: bool = False, ) -> SQLRecordList: """Bulk create validated records by parsing values for an identifier such as a name or an id. Args: values: A list of values for an identifier, e.g. `["name1", "name2"]`. field: A `SQLRecord` field to look up, e.g., `bt.CellMarker.name`. create: Whether to create records if they don't exist. organism: A `bionty.Organism` name or record. source: A `bionty.Source` record to validate against when creating records. standardize: Whether to standardize synonyms in the values. from_source: Whether to create records from public source. mute: Whether to mute logging. Returns: A list of validated records. For bionty registries, knowledge-coupled records are also returned. Notes: For more info, see tutorial: :doc:`docs:manage-ontologies`.
Example:: import bionty as bt # Bulk create from non-validated values will log warnings & returns empty list ulabels = ln.ULabel.from_values(["benchmark", "prediction", "test"]) assert len(ulabels) == 0 # Bulk create records from validated values returns the corresponding existing records ulabels = ln.ULabel.from_values(["benchmark", "prediction", "test"], create=True).save() assert len(ulabels) == 3 # Bulk create records from public reference bt.CellType.from_values(["T cell", "B cell"]).save() """ return _from_values( iterable=values, field=getattr(cls, get_name_field(cls, field=field)), create=create, organism=organism, source=source, mute=mute, ) @strict_classmethod def standardize( cls, values: Iterable, field: StrField | None = None, *, return_field: StrField | None = None, return_mapper: bool = False, case_sensitive: bool = False, mute: bool = False, from_source: bool = True, keep: Literal["first", "last", False] = "first", synonyms_field: str = "synonyms", organism: Union[str, SQLRecord, None] = None, source: SQLRecord | None = None, strict_source: bool = False, ) -> list[str] | dict[str, str]: """Maps input synonyms to standardized names. Args: values: Identifiers that will be standardized. field: The field representing the standardized names. return_field: The field to return. Defaults to field. return_mapper: If `True`, returns `{input_value: standardized_name}`. case_sensitive: Whether the mapping is case sensitive. mute: Whether to mute logging. from_source: Whether to standardize from public source. Defaults to `True` for BioRecord registries. keep: When a synonym maps to multiple names, determines which duplicates to mark as `pd.DataFrame.duplicated`: - `"first"`: returns the first mapped standardized name - `"last"`: returns the last mapped standardized name - `False`: returns all mapped standardized name. When `keep` is `False`, the returned list of standardized names will contain nested lists in case of duplicates. When a field is converted into return_field, keep marks which matches to keep when multiple return_field values map to the same field value. synonyms_field: A field containing the concatenated synonyms. organism: An Organism name or record. source: A `bionty.Source` record that specifies the version to validate against. strict_source: Determines the validation behavior against records in the registry. - If `False`, validation will include all records in the registry, ignoring the specified source. - If `True`, validation will only include records in the registry that are linked to the specified source. Note: this parameter won't affect validation against public sources. Returns: If `return_mapper` is `False`: a list of standardized names. Otherwise, a dictionary of mapped values with mappable synonyms as keys and standardized names as values. See Also: :meth:`~lamindb.models.CanCurate.add_synonym` Add synonyms. :meth:`~lamindb.models.CanCurate.remove_synonym` Remove synonyms. 
Example:: import bionty as bt # save some gene records bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol", organism="human").save() # standardize gene synonyms gene_synonyms = ["A1CF", "A1BG", "FANCD1", "FANCD20"] bt.Gene.standardize(gene_synonyms) #> ['A1CF', 'A1BG', 'BRCA2', 'FANCD20'] """ return _standardize( cls=cls, values=values, field=field, return_field=return_field, return_mapper=return_mapper, case_sensitive=case_sensitive, mute=mute, strict_source=strict_source, from_source=from_source, keep=keep, synonyms_field=synonyms_field, organism=organism, source=source, ) def add_synonym( self, synonym: str | ListLike, force: bool = False, save: bool | None = None, ): """Add synonyms to a record. Args: synonym: The synonyms to add to the record. force: Whether to add synonyms even if they are already synonyms of other records. save: Whether to save the record to the database. See Also: :meth:`~lamindb.models.CanCurate.remove_synonym` Remove synonyms. Example:: import bionty as bt # save "T cell" record record = bt.CellType.from_source(name="T cell").save() record.synonyms #> "T-cell|T lymphocyte|T-lymphocyte" # add a synonym record.add_synonym("T cells") record.synonyms #> "T cells|T-cell|T-lymphocyte|T lymphocyte" """ _check_synonyms_field_exist(self) _add_or_remove_synonyms( synonym=synonym, record=self, force=force, action="add", save=save ) def remove_synonym(self, synonym: str | ListLike): """Remove synonyms from a record. Args: synonym: The synonym values to remove. See Also: :meth:`~lamindb.models.CanCurate.add_synonym` Add synonyms Example:: import bionty as bt # save "T cell" record record = bt.CellType.from_source(name="T cell").save() record.synonyms #> "T-cell|T lymphocyte|T-lymphocyte" # remove a synonym record.remove_synonym("T-cell") record.synonyms #> "T lymphocyte|T-lymphocyte" """ _check_synonyms_field_exist(self) _add_or_remove_synonyms(synonym=synonym, record=self, action="remove") def set_abbr(self, value: str): """Set value for abbr field and add to synonyms. Args: value: A value for an abbreviation. 
See Also: :meth:`~lamindb.models.CanCurate.add_synonym` Example:: import bionty as bt # save an experimental factor record scrna = bt.ExperimentalFactor.from_source(name="single-cell RNA sequencing").save() assert scrna.abbr is None assert scrna.synonyms == "single-cell RNA-seq|single-cell transcriptome sequencing|scRNA-seq|single cell RNA sequencing" # set abbreviation scrna.set_abbr("scRNA") assert scrna.abbr == "scRNA" # synonyms are updated assert scrna.synonyms == "scRNA|single-cell RNA-seq|single cell RNA sequencing|single-cell transcriptome sequencing|scRNA-seq" """ self.abbr = value if hasattr(self, "name") and value == self.name: pass else: try: self.add_synonym(value, save=False) except Exception as e: # pragma: no cover logger.debug( f"Encountered an Exception while attempting to add synonyms.\n{e}" ) if not self._state.adding: # type: ignore self.save() # type: ignore ================================================ FILE: lamindb/models/collection.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Any, Literal, overload from django.db import models from django.db.models import CASCADE, PROTECT, Q from lamin_utils import logger from lamindb_setup.core.hashing import HASH_LENGTH, hash_set from lamindb.base.fields import ( CharField, ForeignKey, OneToOneField, TextField, ) from lamindb.base.utils import strict_classmethod from ..base.uids import base62_20 from ..errors import FieldValidationError from ..models._is_versioned import process_revises from ._is_versioned import IsVersioned from .artifact import ( Artifact, get_run, populate_subsequent_run, save_schema_links, track_run_input, ) from .has_parents import view_lineage from .run import Run, TracksRun, TracksUpdates from .sqlrecord import ( BaseSQLRecord, IsLink, SQLRecord, _get_record_kwargs, init_self_from_db, update_attributes, ) if TYPE_CHECKING: from collections.abc import Iterable, Iterator import anndata as ad import pandas as pd from polars import LazyFrame as PolarsLazyFrame from pyarrow.dataset import Dataset as PyArrowDataset from ..core._mapped_collection import MappedCollection from ..core.storage import UPath from .block import CollectionBlock from .project import Project, Reference from .query_manager import RelatedManager from .query_set import QuerySet from .record import Record from .transform import Transform from .ulabel import ULabel def _load_concat_artifacts( artifacts: list[Artifact], join: Literal["inner", "outer"] = "outer", **kwargs ) -> pd.DataFrame | ad.AnnData: import anndata as ad import pandas as pd suffixes = {artifact.suffix for artifact in artifacts} if len(suffixes) != 1: raise ValueError( "Can only load collections where all artifacts have the same suffix" ) # because we're tracking data flow on the collection-level, here, we don't # want to track it on the artifact-level first_object = artifacts[0].load(is_run_input=False) is_dataframe = isinstance(first_object, pd.DataFrame) is_anndata = isinstance(first_object, ad.AnnData) if not is_dataframe and not is_anndata: raise ValueError(f"Unable to concatenate {suffixes.pop()} objects.") objects = [first_object] artifact_uids = [artifacts[0].uid] for artifact in artifacts[1:]: objects.append(artifact.load(is_run_input=False)) artifact_uids.append(artifact.uid) if is_dataframe: concat_object = pd.concat(objects, join=join, **kwargs) elif is_anndata: label = kwargs.pop("label", "artifact_uid") keys = kwargs.pop("keys", artifact_uids) concat_object = ad.concat(objects, 
join=join, label=label, keys=keys, **kwargs) return concat_object class Collection(SQLRecord, IsVersioned, TracksRun, TracksUpdates): """Versioned collections of artifacts. Args: artifacts: `Artifact | list[Artifact]` One or several artifacts. key: `str` A file-path like key, analogous to the `key` parameter of `Artifact` and `Transform`. description: `str | None = None` A description. meta: `Artifact | None = None` An artifact that defines metadata for the collection. reference: `str | None = None` A simple reference, e.g. an external ID or a URL. reference_type: `str | None = None` A way to indicate the type of the simple reference, e.g. `"url"`. run: `Run | None = None` The run that creates the collection. revises: `Collection | None = None` An old version of the collection. skip_hash_lookup: `bool = False` Skip the hash lookup so that a new collection is created even if a collection with the same hash already exists. See Also: :class:`~lamindb.Artifact` Examples: Create a collection from a list of :class:`~lamindb.Artifact` objects:: collection = ln.Collection([artifact1, artifact2], key="my_project/my_collection") Create a collection that groups a data & a metadata artifact (e.g., here :doc:`docs:rxrx`):: collection = ln.Collection(data_artifact, key="my_project/my_collection", meta=metadata_artifact) """ class Meta(SQLRecord.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" constraints = [ models.UniqueConstraint( fields=["key", "hash"], name="unique_collection_key_hash_not_null", ) ] _len_full_uid: int = 20 _len_stem_uid: int = 16 _name_field: str = "key" id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, db_index=True, max_length=_len_full_uid, default=base62_20, ) """Universal id, valid across DB instances.""" key: str = CharField(db_index=True) """Name or path-like key.""" # below is the only case in which we use a TextField # for description; we do so because users had descriptions exceeding 255 chars # in their instances description: str | None = TextField(null=True) """A description or title.""" hash: str | None = CharField( max_length=HASH_LENGTH, db_index=True, null=True, ) """Hash of collection content.""" reference: str | None = CharField(max_length=255, db_index=True, null=True) """A reference like URL or external ID.""" # also for reference_type here, we allow an extra long max_length reference_type: str | None = CharField(max_length=25, db_index=True, null=True) """Type of reference, e.g., cellxgene Census collection_id.""" ulabels: RelatedManager[ULabel] = models.ManyToManyField( "ULabel", through="CollectionULabel", related_name="collections" ) """ULabels annotating the collection (see :class:`~lamindb.Feature`) ← :attr:`~lamindb.ULabel.collections`.""" run: Run | None = ForeignKey( Run, PROTECT, related_name="output_collections", null=True, default=None ) """:class:`~lamindb.Run` that created the `collection` ← :attr:`~lamindb.Run.output_collections`.""" input_of_runs: RelatedManager[Run] = models.ManyToManyField( Run, related_name="input_collections" ) """Runs that use this collection as an input ← :attr:`~lamindb.Run.input_collections`.""" recreating_runs: RelatedManager[Run] = models.ManyToManyField( "Run", related_name="recreated_collections", ) """Runs that re-created the record after initial creation ← :attr:`~lamindb.Run.recreated_collections`.""" artifacts: RelatedManager[Artifact] = models.ManyToManyField(
"Artifact", related_name="collections", through="CollectionArtifact" ) """Artifacts in collection ← :attr:`~lamindb.Artifact.collections`.""" meta_artifact: Artifact | None = OneToOneField( "Artifact", PROTECT, null=True, unique=True, related_name="_meta_of_collection", ) """An artifact that stores metadata that indexes a collection. It has a 1:1 correspondence with an artifact. If needed, you can access the collection from the artifact via a private field: `artifact._meta_of_collection`. """ linked_in_records: RelatedManager[Record] = models.ManyToManyField( "Record", through="RecordCollection", related_name="linked_collections" ) """This collection is linked in these records as a value ← :attr:`~lamindb.Record.linked_collections`.""" _actions: RelatedManager[Artifact] = models.ManyToManyField( Artifact, related_name="+" ) """Actions to attach for the UI.""" projects: RelatedManager[Project] """Linked projects ← :attr:`~lamindb.Project.collections`.""" references: RelatedManager[Reference] """Linked references ← :attr:`~lamindb.Reference.collections`.""" records: RelatedManager[Record] """Linked records ← :attr:`~lamindb.Record.collections`.""" ablocks: RelatedManager[CollectionBlock] """Attached blocks ← :attr:`~lamindb.CollectionBlock.collection`.""" @overload def __init__( self, artifacts: Artifact | list[Artifact], key: str, description: str | None = None, meta: Any | None = None, reference: str | None = None, reference_type: str | None = None, run: Run | None = None, revises: Collection | None = None, skip_hash_lookup: bool = False, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None # now we proceed with the user-facing constructor if len(args) > 1: raise ValueError("Only one non-keyword arg allowed: artifacts") artifacts: Artifact | list[Artifact] = ( kwargs.pop("artifacts") if len(args) == 0 else args[0] ) meta_artifact: Artifact | None = kwargs.pop("meta_artifact", None) key: str | None = kwargs.pop("key", None) description: str | None = kwargs.pop("description", None) reference: str | None = kwargs.pop("reference", None) reference_type: str | None = kwargs.pop("reference_type", None) run: Run | None = kwargs.pop("run", None) revises: Collection | None = kwargs.pop("revises", None) version_tag: str | None = kwargs.pop("version_tag", kwargs.pop("version", None)) skip_hash_lookup: bool = kwargs.pop("skip_hash_lookup", False) branch = kwargs.pop("branch", None) branch_id = kwargs.pop("branch_id", 1) space = kwargs.pop("space", None) space_id = kwargs.pop("space_id", 1) if not len(kwargs) == 0: valid_keywords = ", ".join( [val[0] for val in _get_record_kwargs(Collection)] ) raise FieldValidationError( f"Only {valid_keywords} can be passed, you passed: {kwargs}" ) if revises is None: revises = ( Collection.filter(key=key, is_latest=True) .order_by("-created_at") .first() ) provisional_uid, version_tag, key, description, revises = process_revises( revises, version_tag, key, description, Collection ) run = get_run(run) if isinstance(artifacts, Artifact): artifacts = [artifacts] else: if not hasattr(artifacts, "__getitem__"): raise ValueError("Artifact or list[Artifact] is allowed.") assert isinstance(artifacts[0], Artifact) # type: ignore # noqa: S101 hash = from_artifacts(artifacts) # type: ignore if meta_artifact is not None: if not isinstance(meta_artifact, Artifact): raise ValueError("meta_artifact has to be an Artifact") if 
isinstance(meta_artifact, Artifact): if meta_artifact._state.adding: raise ValueError( "Save meta_artifact artifact before creating collection!" ) # we ignore collections in trash containing the same hash if hash is not None and not skip_hash_lookup: # this purposefully leaves out the key that we have # in the hard database unique constraint # so that the user is able to find collections with the same hash across # keys # if this is not desired, set skip_hash_lookup=True existing_collection = Collection.objects.filter( ~Q(branch_id=-1), hash=hash, ).first() else: existing_collection = None if existing_collection is not None: logger.warning( f"returning collection with same hash: {existing_collection}; if you intended to query to track this collection as an input, use: ln.Collection.get()" ) init_self_from_db(self, existing_collection) update_attributes(self, {"description": description, "key": key}) populate_subsequent_run(self, run) else: _skip_validation = revises is not None and key == revises.key super().__init__( # type: ignore uid=provisional_uid, key=key, description=description, reference=reference, reference_type=reference_type, meta_artifact=meta_artifact, hash=hash, run=run, version_tag=version_tag, branch=branch, branch_id=branch_id, space=space, space_id=space_id, revises=revises, _skip_validation=_skip_validation, ) self._artifacts = artifacts if revises is not None and revises.uid != self.uid: track_run_input(revises, run=run) track_run_input(artifacts, run=run) @strict_classmethod def get( cls, idlike: int | str | None = None, *, is_run_input: bool | Run = False, **expressions, ) -> Artifact: """Get a single collection. Args: idlike: Either a uid stub, uid or an integer id. is_run_input: Whether to track this collection as run input. expressions: Fields and values passed as Django query expressions. Raises: :exc:`lamindb.errors.DoesNotExist`: In case no matching record is found. See Also: - Method in `SQLRecord` base class: :meth:`~lamindb.models.SQLRecord.get` Examples: :: collection = ln.Collection.get("okxPW6GIKBfRBE3B0000") collection = ln.Collection.get(key="scrna/collection1") """ from .query_set import QuerySet return QuerySet(model=cls).get(idlike, is_run_input=is_run_input, **expressions) def append(self, artifact: Artifact, run: Run | None = None) -> Collection: """Append an artifact to the collection. This does not modify the original collection in-place, but returns a new version of the original collection with the appended artifact. Args: artifact: An artifact to add to the collection. run: The run that creates the new version of the collection. Examples: :: collection_v1 = ln.Collection(artifact, key="My collection").save() collection_v2 = collection.append(another_artifact) # returns a new version of the collection collection_v2.save() # save the new version """ return Collection( # type: ignore self.artifacts.all().to_list() + [artifact], # key is automatically derived from revises.key description=self.description, revises=self, run=run, ) def open( self, engine: Literal["pyarrow", "polars"] = "pyarrow", is_run_input: bool | None = None, **kwargs, ) -> PyArrowDataset | Iterator[PolarsLazyFrame]: """Open a dataset for streaming. Works for `pyarrow` and `polars` compatible formats (`.parquet`, `.csv`, `.ipc` etc. files or directories with such files). Args: engine: Which module to use for lazy loading of a dataframe from `pyarrow` or `polars` compatible formats. is_run_input: Whether to track this artifact as run input. 
**kwargs: Keyword arguments for `pyarrow.dataset.dataset` or `polars.scan_*` functions. Notes: For more info, see guide: :doc:`/arrays`. """ if self._state.adding: artifacts = self._artifacts logger.warning("the collection isn't saved, consider calling `.save()`") else: artifacts = self.ordered_artifacts.all() paths = [artifact.path for artifact in artifacts] from ..core.storage._backed_access import _open_dataframe dataframe = _open_dataframe(paths, engine=engine, **kwargs) # track only if successful track_run_input(self, is_run_input) return dataframe def mapped( self, layers_keys: str | list[str] | None = None, obs_keys: str | list[str] | None = None, obsm_keys: str | list[str] | None = None, obs_filter: dict[str, str | list[str]] | None = None, join: Literal["inner", "outer"] | None = "inner", encode_labels: bool | list[str] = True, unknown_label: str | dict[str, str] | None = None, cache_categories: bool = True, parallel: bool = False, dtype: str | None = None, stream: bool = False, is_run_input: bool | None = None, ) -> MappedCollection: """Return a map-style dataset. Returns a PyTorch map-style dataset by virtually concatenating `AnnData` arrays. By default (`stream=False`) `AnnData` arrays are moved into a local cache first. `__getitem__` of the `MappedCollection` object takes a single integer index and returns a dictionary with the observation data sample for this index from the `AnnData` objects in the collection. The dictionary has keys for `layers_keys` (`.X` is in `"X"`), `obs_keys`, `obsm_keys` (under `f"obsm_{key}"`) and also `"_store_idx"` for the index of the `AnnData` object containing this observation sample. .. note:: For a guide, see :doc:`docs:scrna-mappedcollection`. This method currently only works for collections or query sets of `AnnData` artifacts. Args: layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list retrieves ``.X``. obs_keys: Keys from the ``.obs`` slots. obsm_keys: Keys from the ``.obsm`` slots. obs_filter: Select only observations with these values for the given obs columns. Should be a dictionary with obs column names as keys and filtering values (a string or a list of strings) as values. join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed, does not join. encode_labels: Encode labels into integers. Can be a list with elements from ``obs_keys``. unknown_label: Encode this label to -1. Can be a dictionary with keys from ``obs_keys`` if ``encode_labels=True`` or from ``encode_labels`` if it is a list. cache_categories: Enable caching categories of ``obs_keys`` for faster access. parallel: Enable sampling with multiple processes. dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm`` to this dtype. stream: Whether to stream data from the array backend. is_run_input: Whether to track this collection as run input. Examples: >>> import lamindb as ln >>> from torch.utils.data import DataLoader >>> collection = ln.Collection.get(description="my collection") >>> mapped = collection.mapped(obs_keys=["cell_type", "batch"]) >>> dl = DataLoader(mapped, batch_size=128, shuffle=True) >>> # also works for query sets of artifacts, '...'
represents some filtering condition >>> # additional filtering on artifacts of the collection >>> mapped = collection.artifacts.all().filter(...).order_by("-created_at").mapped() >>> # or directly from a query set of artifacts >>> mapped = ln.Artifact.filter(..., otype="AnnData").order_by("-created_at").mapped() """ from ..core._mapped_collection import MappedCollection path_list = [] if self._state.adding: artifacts = self._artifacts logger.warning("the collection isn't saved, consider calling `.save()`") else: artifacts = self.ordered_artifacts.all() for artifact in artifacts: if ".h5ad" not in artifact.suffix and ".zarr" not in artifact.suffix: logger.warning(f"ignoring artifact with suffix {artifact.suffix}") continue elif not stream: path_list.append(artifact.cache()) else: path_list.append(artifact.path) ds = MappedCollection( path_list, layers_keys, obs_keys, obsm_keys, obs_filter, join, encode_labels, unknown_label, cache_categories, parallel, dtype, ) # track only if successful track_run_input(self, is_run_input) return ds def cache(self, is_run_input: bool | None = None) -> list[UPath]: """Download cloud artifacts in collection to local cache. Follows syncing logic: only downloads outdated artifacts. Returns ordered paths to locally cached on-disk artifacts via `.ordered_artifacts.all()`: Args: is_run_input: Whether to track this collection as run input. """ path_list = [] for artifact in self.ordered_artifacts.all(): # do not want to track data lineage on the artifact level path_list.append(artifact.cache(is_run_input=False)) track_run_input(self, is_run_input) return path_list def load( self, join: Literal["inner", "outer"] = "outer", is_run_input: bool | None = None, **kwargs, ) -> pd.DataFrame | ad.AnnData: """Cache and load to memory. Returns an in-memory concatenated `DataFrame` or `AnnData` object. """ # cannot call track_run_input here, see comment further down artifacts = self.ordered_artifacts.all() concat_object = _load_concat_artifacts(artifacts, join, **kwargs) # only call it here because there might be errors during load or concat track_run_input(self, is_run_input) return concat_object def save(self, using: str | None = None) -> Collection: """Save the collection and underlying artifacts to database & storage. Args: using: The database to which you want to save. Examples: >>> collection = ln.Collection("./myfile.csv", name="myfile") """ if self.meta_artifact is not None: self.meta_artifact.save() super().save() # we don't allow updating the collection of artifacts # if users want to update the set of artifacts, they # have to create a new collection if hasattr(self, "_artifacts"): links = [ CollectionArtifact(collection_id=self.id, artifact_id=artifact.id) # type: ignore for artifact in self._artifacts ] # the below seems to preserve the order of the list in the # auto-incrementing integer primary # merely using .artifacts.set(*...) doesn't achieve this # we need ignore_conflicts=True so that this won't error if links already exist CollectionArtifact.objects.bulk_create(links, ignore_conflicts=True) save_schema_links(self) if using is not None: logger.warning("using argument is ignored") return self def restore(self) -> None: """Restore collection record from trash. 
Examples: For any `Collection` object `collection`, call: >>> collection.restore() """ self.branch_id = 1 self.save() @property def transform(self) -> Transform | None: """Transform whose run created the collection.""" return self.run.transform if self.run is not None else None @property def name(self) -> str: """Name of the collection. Splits `key` on `/` and returns the last element. """ return self.key.split("/")[-1] @property def ordered_artifacts(self) -> QuerySet: """Ordered `QuerySet` of `.artifacts`. Accessing the many-to-many field `collection.artifacts` directly gives you non-deterministic order. Using the property `.ordered_artifacts` allows to iterate through a set that's ordered by the order of the list that created the collection. """ return self.artifacts.order_by("links_collection__id") @property def data_artifact(self) -> Artifact | None: """Access to a single data artifact. If the collection has a single data & metadata artifact, this allows access via:: collection.data_artifact # first & only element of collection.artifacts collection.meta_artifact # metadata """ return self.artifacts.first() # internal function, not exposed to user def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]: # assert all artifacts are already saved saved = not any(artifact._state.adding for artifact in artifacts) if not saved: raise ValueError("Not all artifacts are yet saved, please save them") # validate consistency of hashes - we do not allow duplicate hashes hashes = [artifact.hash for artifact in artifacts if artifact.hash is not None] hashes_set = set(hashes) if len(hashes) != len(hashes_set): seen = set() non_unique = [x for x in hashes if x in seen or seen.add(x)] # type: ignore logger.warning( f"your collection contains artifacts with non-unique hashes: {non_unique}" ) hash = hash_set(hashes_set) return hash class CollectionArtifact(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) collection: Collection = ForeignKey( Collection, CASCADE, related_name="links_artifact" ) artifact: Artifact = ForeignKey(Artifact, PROTECT, related_name="links_collection") class Meta: app_label = "lamindb" unique_together = ("collection", "artifact") # mypy: ignore-errors Collection.view_lineage = view_lineage ================================================ FILE: lamindb/models/feature.py ================================================ from __future__ import annotations import importlib import warnings from dataclasses import dataclass from typing import TYPE_CHECKING, Any, cast, get_args, overload import numpy as np import pgtrigger from django.conf import settings as django_settings from django.db import connection, models from django.db.models import CASCADE, PROTECT from django.db.models.query_utils import DeferredAttribute from django.db.utils import IntegrityError as DjangoIntegrityError from lamin_utils import logger from lamindb_setup._init_instance import get_schema_module_name from lamindb_setup.core import deprecated from lamindb_setup.core.hashing import HASH_LENGTH, hash_dict, hash_string from lamindb_setup.errors import ( MODULE_WASNT_CONFIGURED_MESSAGE_TEMPLATE, ModuleWasntConfigured, ) from lamindb.base.fields import ( BooleanField, CharField, ForeignKey, JSONField, TextField, ) from lamindb.base.types import DtypeStr, FieldAttr from lamindb.errors import ( FieldValidationError, IntegrityError, InvalidArgument, ValidationError, ) from ..base.uids import base62_12 from ._relations import dict_module_name_to_model_name from 
.can_curate import CanCurate from .has_parents import _query_relatives from .query_set import QuerySet, SQLRecordList from .run import ( TracksRun, TracksUpdates, ) from .sqlrecord import BaseSQLRecord, HasType, Registry, SQLRecord, _get_record_kwargs if TYPE_CHECKING: from collections.abc import Iterable import pandas as pd from pandas.core.dtypes.base import ExtensionDtype from .artifact import Artifact from .block import FeatureBlock from .project import Project from .query_manager import RelatedManager from .record import Record from .run import Run from .schema import Schema from .ulabel import ULabel FEATURE_DTYPES = set(get_args(DtypeStr)) @dataclass(frozen=True) class FeaturePredicate: """Predicate generated by comparing a Feature to a value.""" feature: Feature comparator: str value: Any def __bool__(self) -> bool: raise TypeError( "Feature predicates cannot be used as booleans. " "Pass them into `.filter(...)`." ) def parse_dtype( dtype_str: str, check_exists: bool = False, old_format: bool = False ) -> list[dict[str, Any]]: """Parses feature data type string into a structured list of components.""" from .artifact import Artifact allowed_dtypes = FEATURE_DTYPES # Handle list[...] types if dtype_str.startswith("list[") and dtype_str.endswith("]"): inner_dtype_str = dtype_str[5:-1] # Remove "list[" and "]" # Recursively parse the inner type inner_result = parse_dtype(inner_dtype_str, old_format=old_format) # Add "list": True to each component for component in inner_result: if isinstance(component, dict): component["list"] = True # type: ignore return inner_result is_composed_cat = dtype_str.startswith("cat[") and dtype_str.endswith("]") result: list[dict[str, Any]] = [] # backward compatibility for bare "cat" dtype (deprecated) if dtype_str == "cat": return result if is_composed_cat: related_registries = dict_module_name_to_model_name(Artifact) registries_str = dtype_str.replace("cat[", "")[:-1] # strip last ] if registries_str != "": registry_str_list = registries_str.split("|") for cat_single_dtype_str in registry_str_list: single_result = parse_cat_dtype( cat_single_dtype_str, related_registries=related_registries, check_exists=check_exists, old_format=old_format, ) result.append(single_result) elif dtype_str not in allowed_dtypes: raise ValueError( f"dtype is '{dtype_str}' but has to be one of {FEATURE_DTYPES}!" ) return result def get_record_type_from_uid( registry: Registry, record_uid: str, ) -> SQLRecord: type_record: SQLRecord = registry.get(record_uid) if type_record.branch_id == -1: warning_msg = f"retrieving {registry.__name__} type '{type_record.name}' (uid='{record_uid}') from trash" logger.warning(warning_msg) if not type_record.is_type: raise InvalidArgument( f"The resolved {type_record.__class__.__name__} '{type_record.name}' (uid='{record_uid}') is not a type: is_type is False." ) return type_record def get_record_type_from_nested_subtypes( registry: Registry, subtypes_list: list[str], field_str: str ) -> SQLRecord: """Get a record type by querying nested subtypes using raw SQL. This function only works with Record or ULabel registries. 
""" table_name = registry._meta.db_table final_name = subtypes_list[-1] # Build the SQL query with nested joins # For subtypes_list = ["A", "B", "C"], we want: # - Record with name="C" # - Its type has name="B" # - That type's type has name="A" params: list[str | bool] if len(subtypes_list) > 1: # Build nested joins for parent types parent_types = list(reversed(subtypes_list[:-1])) joins = [] where_clauses = ["t0.name = %s"] # Final record name params = [final_name] for i, parent_type_name in enumerate(parent_types): alias = f"t{i + 1}" prev_alias = f"t{i}" joins.append( f"INNER JOIN {table_name} {alias} ON {prev_alias}.type_id = {alias}.id" ) where_clauses.append(f"{alias}.name = %s") where_clauses.append(f"{alias}.is_type = %s") params.extend([parent_type_name, True]) join_clause = " ".join(joins) where_clause = " AND ".join(where_clauses) query = f""" SELECT t0.* FROM {table_name} t0 {join_clause} WHERE {where_clause} LIMIT 1 """ else: # Single type, no parent - type must be NULL query = f""" SELECT * FROM {table_name} WHERE name = %s AND type_id IS NULL LIMIT 1 """ params = [final_name] try: with connection.cursor() as cursor: cursor.execute(query, params) columns = [col[0] for col in cursor.description] rows = cursor.fetchall() if not rows: raise IntegrityError( f"No {registry.__name__} type found matching subtypes {subtypes_list} for field `.{field_str}`" ) if len(rows) > 1: raise IntegrityError( f"Multiple {registry.__name__} types found matching subtypes {subtypes_list} for field `.{field_str}`" ) # Create a dictionary from the row data row_dict = dict(zip(columns, rows[0])) # Create a minimal mock object with only the fields we need # This avoids querying the database which may not have all columns during migrations # We create a simple object and set its class to the registry for proper error messages type_record: SQLRecord = object.__new__(registry) type_record.id = row_dict.get("id") type_record.uid = row_dict.get("uid") type_record.name = row_dict.get("name") type_record.is_type = row_dict.get("is_type", False) # Initialize _state attribute needed by Django models # Create a minimal state object with the required attributes state = type("ModelState", (), {"adding": False, "db": "default"})() type_record._state = state except IntegrityError: raise except Exception as e: raise IntegrityError( f"Error retrieving {registry.__name__} type with subtypes {subtypes_list} for field `.{field_str}`: {e}" ) from e if not type_record.is_type: raise InvalidArgument( f"The resolved {type_record.__class__.__name__} '{type_record.name}' for field `.{field_str}` is not a type: is_type is False." 
) return type_record def dtype_as_object(dtype_str: str, old_format: bool = False) -> type | None: def _dtype_as_object_simple(dtype_str: str) -> type | None: if dtype_str == "str": return str elif dtype_str == "url": return str elif dtype_str == "int": return int elif dtype_str in ("float", "num"): return float elif dtype_str == "bool": return bool elif dtype_str == "date": from datetime import date return date elif dtype_str == "datetime": from datetime import datetime return datetime elif dtype_str.startswith("dict"): return dict return None if dtype_str is None: return None parsed_dtypes = parse_dtype(dtype_str, check_exists=True, old_format=old_format) if len(parsed_dtypes) > 0: dtype_objects = [] for parsed_dtype in parsed_dtypes: if parsed_dtype.get("record_uid"): # return the subtype record for dtypes with record_uid dtype_object = get_record_type_from_uid( parsed_dtype["registry"], parsed_dtype["record_uid"], ) elif parsed_dtype.get("subtypes_list"): dtype_object = get_record_type_from_nested_subtypes( parsed_dtype["registry"], parsed_dtype["subtypes_list"], parsed_dtype["field"], ) else: # return field for dtypes without record_uid, e.g. bt.CellType.ontology_id dtype_object = parsed_dtype["field"] # for list, returns list[SQLRecord] dtype_objects.append( list[dtype_object] # type: ignore if "list" in parsed_dtype and parsed_dtype["list"] else dtype_object ) return dtype_objects if len(dtype_objects) > 1 else dtype_objects[0] # type: ignore elif dtype_str.startswith("list["): # for simple lists, returns list[python_type] dtype_simple_object = _dtype_as_object_simple( dtype_str.removeprefix("list[").removesuffix("]") ) return ( list[dtype_simple_object] if dtype_simple_object is not None else list # type: ignore ) else: return _dtype_as_object_simple(dtype_str) def parse_cat_dtype( dtype_str: str, related_registries: dict[str, SQLRecord] | None = None, is_itype: bool = False, check_exists: bool = False, old_format: bool = False, ) -> dict[str, Any]: """Parses a categorical dtype string into its components (registry, field, subtypes).""" from .artifact import Artifact assert isinstance(dtype_str, str) # noqa: S101 if related_registries is None: related_registries = dict_module_name_to_model_name(Artifact) # Parse the string considering nested brackets parsed = parse_nested_brackets(dtype_str, old_format=old_format) registry_str = parsed["registry"] filter_str = parsed["filter_str"] field_str = parsed["field"] if not is_itype: if registry_str not in related_registries: raise ValidationError( f"'{registry_str}' is an invalid dtype, has to be registry, e.g. ULabel or bionty.CellType" ) registry = related_registries[registry_str] else: if "." 
in registry_str: registry_str_split = registry_str.split(".") assert len(registry_str_split) == 2, registry_str # noqa: S101 module_name_attempt, class_name = registry_str_split module_name = get_schema_module_name( module_name_attempt, raise_import_error=False ) if module_name is None: raise ModuleWasntConfigured( MODULE_WASNT_CONFIGURED_MESSAGE_TEMPLATE.format( module_name_attempt, module_name_attempt ) ) else: module_name, class_name = "lamindb", registry_str module = importlib.import_module(module_name) registry = getattr(module, class_name) if field_str == "": field_str = registry._name_field if hasattr(registry, "_name_field") else "name" assert hasattr(registry, field_str), f"{registry} has no field {field_str}" record_uid = parsed.get("record_uid") subtypes_list = parsed.get("subtypes_list") # Handle old format (subtypes_list) or new format (record_uid) if subtypes_list and check_exists: # Old format: validate that the Record exists using nested subtypes # subtypes_list is guaranteed to be list[str] when present if isinstance(subtypes_list, list): get_record_type_from_nested_subtypes( registry, cast(list[str], subtypes_list), field_str ) elif record_uid and check_exists: get_record_type_from_uid(registry, record_uid) if filter_str != "": # TODO: validate or process filter string pass result = { "registry": registry, # should be typed as CanCurate "registry_str": registry_str, "filter_str": filter_str, "field_str": field_str, "field": getattr(registry, field_str), } # Add record_uid if it exists (new format) if record_uid: result["record_uid"] = record_uid # Add subtypes_list if it exists (old format) if subtypes_list: result["subtypes_list"] = subtypes_list return result def parse_nested_brackets(dtype_str: str, old_format: bool = False) -> dict[str, Any]: """Parse dtype string with potentially nested brackets. Examples: "A" -> {"registry": "A", "filter_str": "", "field": ""} "A.field" -> {"registry": "A", "filter_str": "", "field": "field"} "Record[abcdefg123456]" -> {"registry": "Record", "filter_str": "", "field": "", "record_uid": "abcdefg123456"} "Record[abcdefg123456].name" -> {"registry": "Record", "filter_str": "", "field": "name", "record_uid": "abcdefg123456"} "bionty.Gene.ensembl_gene_id[source__id='abcd']" -> {"registry": "bionty.Gene", "filter_str": "source__id='abcd'", "field": "ensembl_gene_id"} Args: dtype_str: The dtype string to parse Returns: Dictionary with parsed components """ if "[" not in dtype_str: # No brackets - handle simple cases like "A" or "A.field" if "." in dtype_str: parts = dtype_str.split(".") if len(parts) == 2 and parts[1][0].isupper(): # bionty.CellType return {"registry": dtype_str, "filter_str": "", "field": ""} elif len(parts) == 3: # bionty.CellType.name return { "registry": f"{parts[0]}.{parts[1]}", "filter_str": "", "field": parts[2], } else: # ULabel.name return {"registry": parts[0], "filter_str": "", "field": parts[1]} else: # Simple registry name return {"registry": dtype_str, "filter_str": "", "field": ""} # Find the first opening bracket first_bracket = dtype_str.index("[") # Handle case where registry_part contains a field (e.g., "bionty.Gene.ensembl_gene_id[filters]") registry_and_field = dtype_str[:first_bracket] if "." 
in registry_and_field: parts = registry_and_field.split(".") if len(parts) == 3: registry_part = f"{parts[0]}.{parts[1]}" pre_bracket_field = parts[2] else: registry_part = registry_and_field pre_bracket_field = "" else: registry_part = registry_and_field pre_bracket_field = "" # Find the matching closing bracket for the first opening bracket bracket_count = 0 closing_bracket_pos = -1 for i in range(first_bracket, len(dtype_str)): if dtype_str[i] == "[": bracket_count += 1 elif dtype_str[i] == "]": bracket_count -= 1 if bracket_count == 0: closing_bracket_pos = i break if closing_bracket_pos == -1: raise ValueError(f"Unmatched brackets in dtype string: {dtype_str}") # Extract content between brackets bracket_content = dtype_str[first_bracket + 1 : closing_bracket_pos] # Check for field after the closing bracket field_part = "" remainder = dtype_str[closing_bracket_pos + 1 :] if remainder.startswith("."): field_part = remainder[1:] # Remove the dot # Use pre_bracket_field if no post_bracket field if not field_part and pre_bracket_field: field_part = pre_bracket_field # Extract UID, subtypes_list, or filter from bracket content # For UID-based format: Record[uid] or ULabel[uid] -> record_uid # For old name-based format: Record[Name] or Record[Parent[Child]] -> subtypes_list # For filter format: registry.field[filter] -> filter_str record_uid = None subtypes_list = None filter_str = "" # If registry is Record or ULabel, bracket content could be UID or name(s) if registry_part in ("Record", "ULabel"): if bracket_content: if old_format: # Old format with nested brackets like Record[Parent[Child]] extracted = extract_subtypes_and_filter(bracket_content) subtypes_list = extracted["subtypes_list"] filter_str = extracted["filter_str"] else: record_uid = bracket_content else: # For other registries, bracket content is a filter filter_str = bracket_content if bracket_content else "" result = { "registry": registry_part, "filter_str": filter_str, "field": field_part, } # Add record_uid if it exists (new format) if record_uid: result["record_uid"] = record_uid # Add subtypes_list if it exists (old format) if subtypes_list: result["subtypes_list"] = subtypes_list return result def extract_subtypes_and_filter(subtype_str: str) -> dict[str, Any]: """Extract nested subtypes and optional filter from a nested subtype string. 
Examples: "B" -> {"subtypes_list": ["B"], "filter_str": ""} "B[C]" -> {"subtypes_list": ["B", "C"], "filter_str": ""} "B[C[filter='']]" -> {"subtypes_list": ["B", "C"], "filter_str": "filter=''"} "B[C[D]]" -> {"subtypes_list": ["B", "C", "D"], "filter_str": ""} "B[C[D[E]]]" -> {"subtypes_list": ["B", "C", "D", "E"], "filter_str": ""} "B[filter='value']" -> {"subtypes_list": ["B"], "filter_str": "filter='value'"} "Customer[UScustomer[region='US']]" -> {"subtypes_list": ["Customer", "UScustomer"], "filter_str": "region='US'"} Args: subtype_str: The subtype string with potential nesting Returns: Dictionary with subtypes_list and filter_str """ subtypes: list[str] = [] filter_str = "" current = subtype_str while current: if "[" not in current: # No more brackets if current and "=" not in current: # It's a subtype name subtypes.append(current) elif current and "=" in current: # It's a filter filter_str = current break # Find the first part before the bracket bracket_pos = current.index("[") part = current[:bracket_pos] # Add the part (it's a subtype name) if part: subtypes.append(part) # Find the matching closing bracket bracket_count = 0 closing_pos = -1 for i in range(bracket_pos, len(current)): if current[i] == "[": bracket_count += 1 elif current[i] == "]": bracket_count -= 1 if bracket_count == 0: closing_pos = i break if closing_pos == -1: break # Move to the content inside the brackets current = current[bracket_pos + 1 : closing_pos] return {"subtypes_list": subtypes, "filter_str": filter_str} def serialize_dtype( dtype: Registry | SQLRecord | FieldAttr | list[SQLRecord] | list[Registry] | list[str] | list[float] | str | type, is_itype: bool = False, ) -> str: """Converts a data type object into its string representation.""" from .record import Record from .ulabel import ULabel # Handle generic types like list[str], list[Registry], etc. if hasattr(dtype, "__origin__") and dtype.__origin__ is list: # Get the inner type from list[T] inner_type = dtype.__args__[0] if dtype.__args__ else None # type: ignore if inner_type is not None: # Recursively serialize the inner type inner_dtype_str = serialize_dtype(inner_type, is_itype=is_itype) return f"list[{inner_dtype_str}]" if ( not isinstance(dtype, list) and hasattr(dtype, "__name__") and dtype.__name__ in FEATURE_DTYPES ): dtype_str = dtype.__name__ elif dtype is dict: dtype_str = "dict" elif is_itype and isinstance(dtype, str): if dtype not in "Feature": parse_cat_dtype( dtype_str=dtype, is_itype=True ) # throws an error if invalid dtype_str = dtype else: from pandas.core.dtypes.base import ExtensionDtype if isinstance(dtype, (ExtensionDtype, np.dtype)): dtype_str = serialize_pandas_dtype(dtype) else: error_message = "dtype has to be a registry, a ulabel subtype, a registry field, or a list of registries or fields, not {}" if isinstance(dtype, (Registry, DeferredAttribute, ULabel, Record)): dtype = [dtype] elif not isinstance(dtype, list): raise ValueError(error_message.format(dtype)) dtype_str = "" for one_dtype in dtype: if not isinstance( one_dtype, (Registry, DeferredAttribute, ULabel, Record) ): raise ValueError(error_message.format(one_dtype)) if isinstance(one_dtype, Registry): dtype_str += one_dtype.__get_name_with_module__() + "|" elif isinstance(one_dtype, (ULabel, Record)): if one_dtype._state.adding: raise InvalidArgument( f"Cannot serialize unsaved objects. Save {one_dtype} via `.save()`." ) if not one_dtype.is_type: raise InvalidArgument( f"Cannot serialize non-type {one_dtype.__class__.__name__} '{one_dtype.name}'. 
Only types (is_type=True) are allowed in dtypes." ) # Use UID-based format: Record[uid] instead of Record[Parent[Child]] nested_string = f"[{one_dtype.uid}]" if isinstance(one_dtype, ULabel): dtype_str += f"ULabel{nested_string}" else: dtype_str += f"Record{nested_string}" else: name = one_dtype.field.name field_ext = f".{name}" if name != "name" else "" dtype_str += ( one_dtype.field.model.__get_name_with_module__() + field_ext + "|" ) dtype_str = dtype_str.rstrip("|") if not is_itype: dtype_str = f"cat[{dtype_str}]" return dtype_str def serialize_pandas_dtype(pandas_dtype: ExtensionDtype) -> str: """Convert pandas ExtensionDtype to simplified string representation.""" from pandas.api.types import CategoricalDtype, is_string_dtype if is_string_dtype(pandas_dtype): if not isinstance(pandas_dtype, CategoricalDtype): dtype = "str" else: dtype = "cat[ULabel]" # there are string-like categoricals and "pure" categoricals (pd.Categorical) elif isinstance(pandas_dtype, CategoricalDtype): dtype = "cat[ULabel]" else: # strip precision qualifiers dtype = "".join(dt for dt in pandas_dtype.name if not dt.isdigit()) if dtype == "uint": dtype = "int" if dtype.startswith("datetime"): dtype = dtype.split("[")[0] if dtype != "cat[ULabel]": assert dtype in FEATURE_DTYPES # noqa: S101 return dtype def convert_to_pandas_dtype(lamin_dtype: str) -> str | pd.CategoricalDtype: """Convert LaminDB simplified string representation back to pandas dtype.""" from pandas.api.types import CategoricalDtype dtype_map = { "str": "string", # nullable string dtype "url": "string", # URLs are validated as strings "int": "Int64", # Nullable integer to handle missing values "num": "float64", "float": "float64", "bool": "boolean", # Nullable boolean "datetime": "datetime64[ns]", "date": "object", # preserve Date objects "dict": "object", # dicts are stored as object dtype in pandas } if lamin_dtype in dtype_map: return dtype_map[lamin_dtype] elif lamin_dtype.startswith("cat"): return CategoricalDtype() elif lamin_dtype.startswith("list"): return "object" # lists are stored as object dtype in pandas return lamin_dtype def parse_filter_string(filter_str: str) -> dict[str, tuple[str, str | None, str]]: """Parse comma-separated Django filter expressions into structured components. Args: filter_str: Comma-separated filters like 'name=value, relation__field=value' Returns: Dict mapping original filter key to (relation_name, field_name, value) tuple. For direct fields: field_name is None. For relations: field_name contains the lookup field. """ filters = {} filter_parts = [part.strip() for part in filter_str.split(",")] for part in filter_parts: if "=" not in part: raise ValueError(f"Invalid filter expression: '{part}' (missing '=' sign)") key, value = part.split("=", 1) key = key.strip() value = value.strip().strip("'\"") if not key: raise ValueError(f"Invalid filter expression: '{part}' (empty key)") if not value: raise ValueError(f"Invalid filter expression: '{part}' (empty value)") if "__" in key: relation_name, field_name = key.split("__", 1) filters[key] = (relation_name, field_name, value) else: filters[key] = (key, None, value) return filters def resolve_relation_filters( parsed_filters: dict[str, tuple[str, str | None, str]], registry: SQLRecord ) -> dict[str, str | SQLRecord]: """Resolve relation filters actual model objects. 
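    A minimal sketch of the expected flow (registry and filter values are illustrative)::

        parsed = parse_filter_string("name=influenza, source__name=mondo-2024")
        # {'name': ('name', None, 'influenza'),
        #  'source__name': ('source', 'name', 'mondo-2024')}
        resolved = resolve_relation_filters(parsed, bt.Disease)
        # direct field filters pass through unchanged; relation filters are replaced
        # by the related record, e.g. {'name': 'influenza', 'source': <Source record>}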
Args: parsed_filters: Django filters like output from :func:`lamindb.models.feature.parse_filter_string` registry: Model class to resolve relationships against Returns: Dict with resolved objects for successful relations, original values for direct fields and failed resolutions. """ resolved = {} for filter_key, (relation_name, field_name, value) in parsed_filters.items(): if field_name is not None: # relation filter if hasattr(registry, relation_name): relation_field = getattr(registry, relation_name) if ( hasattr(relation_field, "field") and relation_field.field.is_relation ): related_model = relation_field.field.related_model related_obj = related_model.get(**{field_name: value}) resolved[relation_name] = related_obj else: resolved[filter_key] = value return resolved def migrate_dtype_to_uid_format(connection, input_field: str = "_dtype_str") -> None: """Update _dtype_str for nested Record/ULabel types to uid format. Converts old format (name-based) dtype strings to new UID-based format. This function is used in migrations to update existing feature records. Args: connection: Database connection (from schema_editor.connection) input_field: Field name to read from ("_dtype_str" or "dtype") Returns: None. Updates are performed directly in the database. """ # Patterns to look for old format (name-based) patterns = [ "cat[Record[", "cat[ULabel[", "list[cat[Record[", "list[cat[ULabel[", ] # Build SQL query to fetch features matching any pattern # Using OR conditions for each pattern pattern_conditions = " OR ".join( [f"{input_field} LIKE '{pattern}%'" for pattern in patterns] ) query = f""" SELECT id, uid, name, {input_field} FROM lamindb_feature WHERE {pattern_conditions} """ # Fetch matching features with connection.cursor() as cursor: cursor.execute(query) columns = [col[0] for col in cursor.description] features = [dict(zip(columns, row)) for row in cursor.fetchall()] # Convert each feature for feature in features: try: # Convert old format string to objects, then serialize to UID format dtype_objects = dtype_as_object(feature[input_field], old_format=True) new_dtype_str = serialize_dtype(dtype_objects) if new_dtype_str != feature[input_field]: # Update using raw SQL update_query = """ UPDATE lamindb_feature SET _dtype_str = %s WHERE id = %s """ with connection.cursor() as cursor: cursor.execute(update_query, [new_dtype_str, feature["id"]]) except Exception as e: # If conversion fails, keep the original value print( f"Warning: Could not convert dtype for feature {feature['name']} ({feature['uid']}) because of error: {e}" ) continue def process_init_feature_param(args, kwargs): # now we proceed with the user-facing constructor if len(args) != 0: raise ValueError("Only keyword args allowed") name: str = kwargs.pop("name", None) dtype: type | str | None = kwargs.pop("dtype", None) is_type: bool = kwargs.pop("is_type", False) type_: Feature | str | None = kwargs.pop("type", None) description: str | None = kwargs.pop("description", None) branch = kwargs.pop("branch", None) branch_id = kwargs.pop("branch_id", 1) space = kwargs.pop("space", None) space_id = kwargs.pop("space_id", 1) _skip_validation = kwargs.pop("_skip_validation", False) if kwargs: valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Feature)]) raise FieldValidationError(f"Only {valid_keywords} are valid keyword arguments") kwargs["name"] = name kwargs["type"] = type_ kwargs["is_type"] = is_type kwargs["branch"] = branch kwargs["branch_id"] = branch_id kwargs["space"] = space kwargs["space_id"] = space_id 
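    # re-attach the validated keyword arguments for the base SQLRecord constructor;
    # dtype is handled separately below and stored as the serialized string field `_dtype_str`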
kwargs["_skip_validation"] = _skip_validation kwargs["description"] = description # cast dtype if dtype is None and not is_type: raise ValidationError( f"Please pass dtype, one of {FEATURE_DTYPES} or a composed categorical dtype" ) dtype_str = None if dtype is not None: if not isinstance(dtype, str): dtype_str = serialize_dtype(dtype) elif dtype in {"num", "path", "url"}: dtype_str = dtype else: logger.warning( f"rather than passing a string '{dtype}' to dtype, consider passing a Python object" ) dtype_str = dtype parse_dtype(dtype_str, check_exists=True, old_format=True) if dtype_str.startswith( ("cat[Record[", "cat[ULabel[", "list[cat[Record[", "list[cat[ULabel[") ): # need to convert from old semantic format to new uid-based format dtype_str = serialize_dtype(dtype_as_object(dtype_str, old_format=True)) kwargs["_dtype_str"] = dtype_str return kwargs UPDATE_FEATURE_ON_NAME_CHANGE = """\ DECLARE old_renamed JSONB; new_renamed JSONB; ts TEXT; BEGIN -- Only proceed if name actually changed IF OLD.name IS DISTINCT FROM NEW.name THEN -- Update synonyms IF NEW.synonyms IS NULL OR NEW.synonyms = '' THEN NEW.synonyms := OLD.name; ELSIF position(OLD.name in NEW.synonyms) = 0 THEN NEW.synonyms := NEW.synonyms || '|' || OLD.name; END IF; -- Update _aux with rename history ts := TO_CHAR(NOW() AT TIME ZONE 'UTC', 'YYYY-MM-DD"T"HH24:MI:SS"Z"'); -- Get existing renamed history or initialize empty object old_renamed := COALESCE((OLD._aux->>'renamed')::JSONB, '{}'::JSONB); -- Add old name with timestamp new_renamed := old_renamed || jsonb_build_object(ts, OLD.name); -- Update _aux with new renamed history IF NEW._aux IS NULL THEN NEW._aux := jsonb_build_object('renamed', new_renamed); ELSE NEW._aux := NEW._aux || jsonb_build_object('renamed', new_renamed); END IF; END IF; RETURN NEW; END; """ class Feature(SQLRecord, HasType, CanCurate, TracksRun, TracksUpdates): """Measurable properties such as dataframe columns or record fields. Features represent *what* is measured in a dataset—the variables or dimensions along which data is organized. They enable you to query datasets based on their structure and corresponding label annotations. Args: name: `str` Name of the feature, typically a column name. dtype: `type | ULabel | Record | DtypeStr | Registry | list[Registry] | FieldAttr` Types or `ULabel` or `Record` objects representing types. See :class:`~lamindb.base.types.DtypeStr`. type: `Feature | None = None` A feature type, see :attr:`~lamindb.Feature.type`. is_type: `bool = False` Whether this feature is a type, see :attr:`~lamindb.Feature.is_type`. unit: `str | None = None` Unit of measure, ideally SI (`"m"`, `"s"`, `"kg"`, etc.) or `"normalized"` etc. description: `str | None = None` A description. synonyms: `str | None = None` Bar-separated synonyms. nullable: `bool = True` Whether the feature can have null-like values (`None`, `pd.NA`, `NaN`, etc.), see :attr:`~lamindb.Feature.nullable`. default_value: `Any | None = None` Default value for the feature. coerce: `bool | None = None` When `True`, attempts to coerce values to the specified dtype during validation, see :attr:`~lamindb.Feature.coerce`. Defaults to `False` unless `is_type` is `True`. cat_filters: `dict[str, str | SQLRecord] | None = None` Subset a registry by additional filters to define valid categories. Note: For more control, you can use :mod:`bionty` registries to manage simple biological entities like genes, proteins & cell markers. Or you define custom registries to manage high-level derived features like gene sets. 
See Also: :meth:`~lamindb.Feature.from_dataframe` Create feature records from DataFrame. :attr:`~lamindb.Artifact.features` Feature manager of an artifact or collection. :class:`~lamindb.ULabel` Universal labels. :class:`~lamindb.Schema` Sets of features. Example: Features with simple data types:: ln.Feature(name="sample_note", dtype=str).save() ln.Feature(name="temperature_in_celsius", dtype=float).save() ln.Feature(name="read_count", dtype=int).save() A categorical feature measuring labels managed in the `ULabel` registry:: ln.Feature(name="sample", dtype=ln.ULabel).save() Restrict a categorical feature to a specific `ULabel` type:: perturbation = ln.ULabel(name="Perturbation", is_type=True).save() ln.Feature(name="perturbation", dtype=perturbation).save() Restrict a categorical feature to a specific `Record` type:: experiment = ln.Record(name="Experiment", is_type=True).save() ln.Feature(name="experiment", dtype=experiment).save() Restrict a categorical feature to the `bt.CellType` registry:: ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save() # expert annotation ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save() # model annotation .. admonition:: Categoricals define relationships. In LaminDB, **categoricals** define **relationships**. For example, with dtype set to a `ULabel` type, setting a feature value relates the object to a `ULabel` of that type. Scope a feature with a **feature type** to distinguish the same feature name across different contexts:: abc_feature_type = ln.Feature(name="ABC", is_type=True).save() # ABC could reference a schema, a project, a team, etc. ln.Feature(name="concentration_nM", dtype=float, type=abc_feature_type).save() xyz_feature_type = ln.Feature(name="XYZ", is_type=True).save() # XYZ could reference a schema, a project, a team, etc. ln.Feature(name="concentration_nM", dtype=float, type=xyz_feature_type).save() # calling .save() again with the same name and type returns the existing feature ln.Feature(name="concentration_nM", dtype=float, type=xyz_feature_type).save() Annotate an artifact with features (works identically for records and runs):: artifact.features.set_values({ "temperature_in_celsius": 37.5, "sample_note": "Control sample", }) Query artifacts/records/runs by features:: ln.Artifact.filter(features__name="temperature_in_celsius") # artifacts with this feature ln.Artifact.filter(temperature_in_celsius__gt=37) # artifacts where temperature > 37 Disambiguate duplicate feature names by querying with a `Feature` object:: feature = ln.Feature.get(name="my_ambig_name", type__name="my_feature_type") ln.Artifact.filter(feature == "hello") # instead of my_ambig_name="hello" A list dtype:: ln.Feature( name="cell_types", dtype=list[bt.CellType], # or list[str] for a list of strings ).save() A path feature:: ln.Feature( name="image_path", dtype="path", # will be validated as `str` ).save() Restrict categories via filters:: # restrict diseases to those matching a specific ontology version source = bt.Source.get(name="My ontology") # a registry for ontology versions ln.Feature( name="disease", dtype=bt.Disease, cat_filters={"source": source}, ).save() # restrict artifacts to those matching a specific schema schema = ln.Schema.get(name="my-schema") ln.Feature( name="valid_artifact", dtype=ln.Artifact, cat_filters={"schema": schema}, ).save() A feature accepting multiple categorical types - a union type:: ln.Feature( name="cell_types", dtype="cat[bionty.Tissue.ontology_id|bionty.CellType.ontology_id]" ).save() .. 
dropdown:: What is the difference between features and labels? 1. A feature qualifies what is measured, i.e., a numerical or categorical random variable 2. A label *is* a measured value of a categorical variable, i.e., a category Example: When annotating a dataset that measures expression of 30k genes, the gene identifiers serve as feature identifiers, and the features are expression measurements for these genes. When annotating a dataset whose experiment knocked out 3 specific genes, those genes serve as labels of the dataset. Re-shaping data can introduce ambiguity among features & labels. If this happened, ask yourself what the joint measurement was: a feature qualifies variables in a joint measurement. The canonical data matrix lists jointly measured variables in the columns. """ class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" if ( django_settings.DATABASES.get("default", {}).get("ENGINE") == "django.db.backends.postgresql" ): triggers = [ pgtrigger.Trigger( name="update_feature_on_name_change", operation=pgtrigger.Update, when=pgtrigger.Before, condition=pgtrigger.Condition("OLD.name IS DISTINCT FROM NEW.name"), func=UPDATE_FEATURE_ON_NAME_CHANGE, ), ] constraints = [ models.CheckConstraint( condition=models.Q(is_type=True) | models.Q(_dtype_str__isnull=False), name="feature_dtype_str_not_null_when_is_type_false", ), # also see raw SQL constraints for `is_type` and `type` FK validity in migrations ] # Keep Django model hash/equality semantics for model identity use-cases. __hash__ = SQLRecord.__hash__ _name_field: str = "name" id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, db_index=True, max_length=12, default=base62_12 ) """Universal id, valid across DB instances.""" name: str = CharField(max_length=150, db_index=True) """Name of feature.""" _dtype_str: DtypeStr | str | None = CharField(db_index=True, null=True) """The string-serialized data type (:class:`~lamindb.base.types.DtypeStr`). Note that mutating this field currently does not trigger re-validation of existing values. """ type: Feature | None = ForeignKey( "self", PROTECT, null=True, related_name="features" ) """Type of feature (e.g., 'Readout', 'Metric', 'Metadata', 'ExpertAnnotation', 'ModelPrediction'). Allows to group features by type, e.g., all read outs, all metrics, etc. """ features: Feature """Features of this type (can only be non-empty if `is_type` is `True`).""" unit: str | None = CharField(max_length=30, db_index=True, null=True) """Unit of measure, ideally SI (`m`, `s`, `kg`, etc.) or 'normalized' etc. (optional).""" description: str | None = TextField(null=True) """A description.""" array_rank: int = models.SmallIntegerField(default=0, db_index=True) """Rank of feature. Number of indices of the array: 0 for scalar, 1 for vector, 2 for matrix. Is called `.ndim` in `numpy` and `pytorch` but shouldn't be confused with the dimension of the feature space. """ array_size: int = models.IntegerField(default=0, db_index=True) """Number of elements of the feature. Total number of elements (product of shape components) of the array. - A number or string (a scalar): 1 or `None` - A 50-dimensional embedding: 50 - A 25 x 25 image: 625 """ array_shape: list[int] | None = JSONField(default=None, db_default=None, null=True) """Shape of the feature. 
- A number or string (a scalar): [1] or `None` - A 50-dimensional embedding: [50] - A 25 x 25 image: [25, 25] Is stored as a list rather than a tuple because it's serialized as JSON. """ synonyms: str | None = TextField(null=True) """Bar-separated (|) synonyms (optional).""" default_value: Any | None = JSONField(null=True, default=None) """A default value that overwrites missing values during standardization.""" nullable: bool | None = BooleanField(null=True, default=None) """Whether the feature can have nullable values. None for type-like features.""" coerce: bool | None = BooleanField(null=True, default=None) """Whether dtypes should be coerced during validation. None for type-like features.""" # we define the below ManyToMany on the Feature model because it parallels # how other registries (like Gene, Protein, etc.) relate to Schema schemas: RelatedManager[Schema] = models.ManyToManyField( "Schema", through="SchemaFeature", related_name="features" ) """Schemas linked to this feature.""" values: RelatedManager[JsonValue] """Values for this feature.""" projects: RelatedManager[Project] """Annotating projects.""" ablocks: RelatedManager[FeatureBlock] """Attached blocks ← :attr:`~lamindb.FeatureBlock.feature`.""" @overload def __init__( self, name: str, dtype: DtypeStr | ULabel | Record | Registry | list[Registry] | FieldAttr, type: Feature | None = None, is_type: bool = False, unit: str | None = None, description: str | None = None, synonyms: str | None = None, nullable: bool | None = None, default_value: Any | None = None, coerce: bool | None = None, cat_filters: dict[str, str] | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None default_value = kwargs.pop("default_value", None) nullable = kwargs.pop("nullable", None) # Default nullable to True for non-type features is_type = kwargs.get("is_type", False) if nullable is None and not is_type: nullable = True cat_filters = kwargs.pop("cat_filters", None) if "coerce_dtype" in kwargs: warnings.warn( "`coerce_dtype` argument was renamed to `coerce` and will be removed in a future release.", DeprecationWarning, stacklevel=2, ) coerce = kwargs.pop("coerce_dtype") else: coerce = kwargs.pop("coerce", None) kwargs = process_init_feature_param(args, kwargs) super().__init__(*args, **kwargs) self.default_value = default_value self.nullable = nullable self.coerce = coerce dtype_str = kwargs.pop("_dtype_str", None) if dtype_str == "cat": warnings.warn( "dtype `cat` is deprecated and will be removed in the future - " "please use `ln.Record` or `ln.ULabel` instead", DeprecationWarning, stacklevel=2, ) if cat_filters: if "|" in dtype_str: raise ValidationError( f"cat_filters are incompatible with union dtypes: '{dtype_str}'" ) if "]]" in dtype_str: raise ValidationError( f"cat_filters are incompatible with nested dtypes: '{dtype_str}'" ) # Validate filter values and SQLRecord attributes for filter_key, filter_value in cat_filters.items(): if not filter_value or ( isinstance(filter_value, str) and not filter_value.strip() ): raise ValidationError(f"Empty value in filter {filter_key}") # Check SQLRecord attributes for relation lookups if isinstance(filter_value, SQLRecord) and "__" in filter_key: field_name = filter_key.split("__", 1)[1] if not hasattr(filter_value, field_name): raise ValidationError( f"SQLRecord {filter_value.__class__.__name__} has no attribute '{field_name}' in filter {filter_key}" ) # If a 
SQLRecord is passed, we access its uid to apply a standard filter cat_filters = { f"{key}__uid" if ( is_sqlrecord := isinstance(filter, SQLRecord) and hasattr(filter, "uid") ) else key: filter.uid if is_sqlrecord else filter for key, filter in cat_filters.items() } fill_in = ", ".join( f"{key}='{value}'" for (key, value) in cat_filters.items() ) dtype_str = dtype_str.replace("]", f"[{fill_in}]]") self._dtype_str = dtype_str if not self._state.adding: if self._dtype_str != dtype_str: raise ValidationError( f"Feature {self.name} already exists with dtype {self._dtype_str}, you passed {dtype_str}" ) def __eq__(self, other: object) -> bool: # Preserve model identity semantics only for Feature-to-Feature comparisons. if isinstance(other, Feature): return super().__eq__(other) # Runtime returns a predicate object for query composition. # Cast keeps mypy-compatible override with object.__eq__ -> bool. return cast(bool, FeaturePredicate(self, "", other)) def __ne__(self, other: object) -> bool: # Preserve model identity semantics only for Feature-to-Feature comparisons. if isinstance(other, Feature): return not super().__eq__(other) # Runtime returns a predicate object for query composition. # Cast keeps mypy-compatible override with object.__ne__ -> bool. return cast(bool, FeaturePredicate(self, "__ne", other)) def __gt__(self, value: Any) -> FeaturePredicate: return FeaturePredicate(self, "__gt", value) def __ge__(self, value: Any) -> FeaturePredicate: return FeaturePredicate(self, "__gte", value) def __lt__(self, value: Any) -> FeaturePredicate: return FeaturePredicate(self, "__lt", value) def __le__(self, value: Any) -> FeaturePredicate: return FeaturePredicate(self, "__lte", value) # manually sync this docstring across all other children of HasType def query_features(self) -> QuerySet: """Query features of sub types. While `.features` retrieves the features with the current type, this method also retrieves sub types and the features with sub types of the current type. """ return _query_relatives([self], "features") # type: ignore @classmethod def from_dataframe( cls, df: pd.DataFrame, field: FieldAttr | None = None, *, mute: bool = False ) -> SQLRecordList: """Create Feature records for dataframe columns. 
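        A usage sketch (column names are illustrative): categorical columns are typed as
        ``cat[ULabel]``, while the remaining columns get their dtype inferred from the
        pandas dtype (e.g. float columns become ``float``, plain string columns ``str``)::

            df = pd.DataFrame(
                {
                    "temperature_in_celsius": [37.2, 36.8],
                    "treatment": pd.Categorical(["drug_a", "drug_b"]),
                }
            )
            features = ln.Feature.from_dataframe(df)  # returns unsaved Feature records
            ln.save(features)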
Args: df: Source DataFrame to extract column information from field: FieldAttr for Feature model validation, defaults to Feature.name mute: Whether to mute Feature creation similar names found warnings """ from lamindb.models import ULabel field = Feature.name if field is None else field registry = field.field.model # type: ignore if registry != Feature: raise ValueError("field must be a Feature FieldAttr!") categoricals = categoricals_from_df(df) dtypes: dict[str, type | SQLRecord | FieldAttr] = {} for name, col in df.items(): if name in categoricals: dtypes[name] = ULabel else: dtype_str = serialize_pandas_dtype(col.dtype) dtypes[name] = dtype_as_object(dtype_str) if mute: original_verbosity = logger._verbosity logger.set_verbosity(0) try: features = [ Feature(name=name, dtype=dtype) for name, dtype in dtypes.items() ] # type: ignore assert len(features) == len(df.columns) # noqa: S101 return SQLRecordList(features) finally: if mute: logger.set_verbosity(original_verbosity) @classmethod @deprecated("from_dataframe") def from_df( cls, df: pd.DataFrame, field: FieldAttr | None = None, *, mute: bool = False ) -> SQLRecordList: return cls.from_dataframe(df, field, mute=mute) @classmethod def from_dict( cls, dictionary: dict[str, Any], field: FieldAttr | None = None, *, type: Feature | None = None, mute: bool = False, ) -> SQLRecordList: """Create Feature records for dictionary keys. Args: dictionary: Source dictionary to extract key information from field: FieldAttr for Feature model validation, defaults to `Feature.name` type: Feature type of all created features mute: Whether to mute dtype inference and feature creation warnings """ from lamindb.models._feature_manager import infer_convert_dtype_key_value field = Feature.name if field is None else field registry = field.field.model # type: ignore if registry != Feature: raise ValueError("field must be a Feature FieldAttr!") dtypes = {} for key, value in dictionary.items(): dtype, _, message = infer_convert_dtype_key_value(key, value, mute=mute) if dtype == "cat ? str": dtype = "str" elif dtype == "list[cat ? 
str]": dtype = "list[str]" dtypes[key] = dtype if mute: original_verbosity = logger._verbosity logger.set_verbosity(0) try: features = [ Feature(name=key, dtype=dtype, type=type) for key, dtype in dtypes.items() ] # type: ignore assert len(features) == len(dictionary) # noqa: S101 return SQLRecordList(features) finally: if mute: logger.set_verbosity(original_verbosity) def save(self, *args, **kwargs) -> Feature: """Save the feature to the instance.""" super().save(*args, **kwargs) return self def with_config(self, optional: bool | None = None) -> tuple[Feature, dict]: """Pass addtional configurations to the schema.""" if optional is not None: return self, {"optional": optional} return self, {} @property @deprecated("coerce") def coerce_dtype(self) -> bool | None: """Alias for coerce (backward compatibility).""" return self.coerce @coerce_dtype.setter def coerce_dtype(self, value: bool | None) -> None: self.coerce = value @property @deprecated("dtype_as_str") def dtype(self) -> str | None: """The `dtype` as a string.""" if self._dtype_str is None: return None if self._dtype_str.startswith( ("cat[Record[", "cat[ULabel[", "list[cat[Record[", "list[cat[ULabel[") ): if self._dtype_str.startswith("list["): dtype_str = self._dtype_str.replace("list[", "")[:-1] else: dtype_str = self._dtype_str record_object = dtype_as_object(dtype_str) nested_string = f"[{record_object.name}]" # type: ignore for t in record_object.query_types(): # type: ignore nested_string = f"[{t.name}{nested_string}]" return self._dtype_str.replace(f"[{record_object.uid}]", nested_string) # type: ignore else: return self._dtype_str @property def dtype_as_str(self) -> DtypeStr | str | None: """The `dtype` as a string. You can query by this property as if it was a string field. The query is delegated to the private `_dtype_str` field. Is `None` if `Feature` if `is_type=True`, otherwise a string. Examples: Query by `dtype_as_str`:: ln.Feature.filter(dtype_as_str="float").to_dataframe() Examples for `dtype_as_str`:: feature_float = ln.Feature(name="measurement", dtype=float).save() assert feature_float.dtype_as_str == "float" sample_type = bt.Record(name="Sample", is_type=True).save() feature_sample = ln.Feature(name="sample", dtype=sample_type).save() assert feature_sample.dtype_as_str == "cat[Record[12345678abcdeFGHI]] # uid of type record feature_list_float = ln.Feature(name="numbers", dtype=list[float]).save() assert feature_list_float.dtype_as_str == "list[float]" feature_ulabel = ln.Feature(name="sample", dtype=ln.ULabel).save() assert feature_ulabel.dtype_as_str == "cat[ULabel]" feature_record = ln.Feature(name="sample", dtype=bt.CellLine).save() assert feature_record.dtype_as_str == "cat[bionty.CellLine]" feature_list_record = ln.Feature(name="cell_types", dtype=list[bt.CellLine]).save() assert feature_list_record.dtype_as_str == "list[cat[bionty.CellLine]]" """ return self._dtype_str @property def dtype_as_object(self) -> type | SQLRecord | FieldAttr | None: # type: ignore """The `dtype` as an object. 
        Example:
            For simple dtypes, returns the built-in Python type::

                feature_float = ln.Feature(name="measurement", dtype=float).save()
                assert feature_float.dtype_as_object is float

            For features with `Record` or `ULabel` types, returns the `Record` or `ULabel` object::

                sample_type = ln.Record(name="Sample", is_type=True).save()
                feature_sample = ln.Feature(name="sample", dtype=sample_type).save()
                assert feature_sample.dtype_as_object == sample_type

            For features with `Registry` types, returns the `Registry` object or a field (`DeferredAttribute`) object::

                feature_cell_type = ln.Feature(name="cell_type_name", dtype=bt.CellType).save()
                assert feature_cell_type.dtype_as_object == bt.CellType

                feature_ontology_id = ln.Feature(name="ontology_id", dtype=bt.CellType.ontology_id).save()
                assert feature_ontology_id.dtype_as_object == bt.CellType.ontology_id
        """
        return dtype_as_object(self._dtype_str)

    # we'll enable this later
    # @property
    # def observational_unit(self) -> Literal["Artifact", "Observation"]:
    #     """Default observational unit on which the feature is measured.
    #     Currently, we only make a distinction between artifact-level and observation-level features.
    #     For example, a feature `"ml_split"` that stores `"test"` & `"train"` labels is typically defined on the artifact level.
    #     When accessing `artifact.features.get_values(["ml_split"])`, you expect a single value, either `"test"` or `"train"`.
    #     However, when accessing an artifact annotation with a feature that's defined on the observation-level, say `"cell_type"`, you expect a set of values. So,
    #     `artifact.features.get_values(["cell_type_from_expert"])` should return a set: `{"T cell", "B cell"}`.
    #     The value of `observational_unit` is currently auto-managed: if using `artifact.features.set_values()`,
    #     it will be set to `Artifact`. In a curator, the value depends on whether it's an artifact- or observation-level slot
    #     (e.g. `.uns` is artifact-level in `AnnData` whereas `.obs` is observation-level).
    #     Note: This attribute might in the future be used to distinguish different types of observational units (e.g. single cells vs. physical samples vs. study subjects etc.).
    #     """
    #     if self._expect_many:
    #         return "Observation"  # this here might be replaced with the specific observational unit
    #     else:
    #         return "Artifact"


class JsonValue(SQLRecord, TracksRun):
    """JSON values for annotating artifacts and runs.

    Categorical values are stored in their respective registries: :class:`~lamindb.ULabel`, :class:`~bionty.CellType`, etc.

    Unlike for `ULabel`, in `JsonValue`, values are grouped by features and not by an ontological hierarchy.
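    A usage sketch (feature name and value are illustrative)::

        settings = ln.Feature(name="run_settings", dtype=dict).save()
        value, existed = JsonValue.get_or_create(settings, {"learning_rate": 0.001, "epochs": 10})
        # `existed` is True if an equal (feature, value) pair was already stored;
        # equality is checked via a hash of the dict rather than the JSON payload itself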
""" # we do not have a unique constraint on feature & value because it leads to hashing errors # for large dictionaries: https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0000 # we do not hash values because we have `get_or_create` logic all over the place # and also for checking whether the (feature, value) combination exists # there does not seem an issue with querying for a dict-like value # https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0001 _name_field: str = "value" feature: Feature | None = ForeignKey( Feature, CASCADE, null=True, related_name="values", default=None ) """The dimension metadata.""" value: Any = models.JSONField() """The JSON-like value.""" hash: str = CharField(max_length=HASH_LENGTH, null=True, db_index=True) """Value hash.""" artifacts: Artifact """Artifacts annotated with this feature value.""" runs: Run """Runs annotated with this feature value.""" class Meta(BaseSQLRecord.Meta, TracksRun.Meta): app_label = "lamindb" unique_together = ("feature", "hash") @classmethod def get_or_create(cls, feature, value): # simple values: (int, float, str, bool, datetime) if not isinstance(value, dict): hash = hash_string(str(value)) else: hash = hash_dict(value) try: return ( cls.objects.create(feature=feature, value=value, hash=hash), False, ) except DjangoIntegrityError: return cls.objects.get(feature=feature, hash=hash), True def suggest_categorical_for_str_iterable( iterable: Iterable[str], key: str = None ) -> str: import pandas as pd c = pd.Categorical(iterable) message = "" if len(c.categories) < len(c): if key != "": key_note = f" for feature {key}" else: key_note = "" message = f"You have few permissible values{key_note}, consider dtype 'cat' instead of 'str'" return message def categoricals_from_df(df: pd.DataFrame) -> dict: """Returns categorical columns.""" from pandas.api.types import CategoricalDtype, is_string_dtype string_cols = [col for col in df.columns if is_string_dtype(df[col])] categoricals = { col: df[col] for col in df.columns if isinstance(df[col].dtype, CategoricalDtype) } for key in string_cols: message = suggest_categorical_for_str_iterable(df[key], key) if message: logger.warning(message) return categoricals ================================================ FILE: lamindb/models/has_parents.py ================================================ # ruff: noqa: TC004 from __future__ import annotations import builtins from typing import TYPE_CHECKING, Literal import lamindb_setup as ln_setup from lamin_utils import logger from ..errors import ValidationError from .query_set import SQLRecordList, get_default_branch_ids from .run import Run from .sqlrecord import HasType, format_field_value, get_name_field if TYPE_CHECKING: from graphviz import Digraph from lamindb.base.types import StrField from .artifact import Artifact from .collection import Collection from .query_set import BasicQuerySet, QuerySet from .sqlrecord import SQLRecord LAMIN_GREEN_LIGHTER = "#10b981" LAMIN_GREEN_DARKER = "#065f46" TRANSFORM_VIOLET = "#eff2ff" GREEN_FILL = "honeydew" is_run_from_ipython = getattr(builtins, "__IPYTHON__", False) # this is optimized to have fewer recursive calls # also len of QuerySet can be costly at times def _query_relatives( records: BasicQuerySet | list[HasParents], attr: Literal["children", "parents"] | str, ) -> QuerySet: branch_ids = get_default_branch_ids() if hasattr(records, "values_list"): model = records.model # type: ignore using_db = records.db # type: ignore frontier_ids = set(records.values_list("id", flat=True)) else: record 
= records[0] model = record.__class__ using_db = record._state.db # type: ignore frontier_ids = {r.id for r in records} # type: ignore if attr == "children": attr_filter = "parents__id__in" elif attr == "parents": attr_filter = "children__id__in" else: attr_filter = "type__id__in" seen_ids = set(frontier_ids) # copies results = set() while frontier_ids: relatives_qs = model.connect(using_db).filter( branch_id__in=branch_ids, **{attr_filter: frontier_ids} ) next_ids = set(relatives_qs.values_list("id", flat=True)) - seen_ids if not next_ids: break results.update(next_ids) seen_ids.update(next_ids) frontier_ids = next_ids return model.connect(using_db).filter(id__in=results) def keep_topmost_matches(records: list[HasType] | SQLRecordList) -> SQLRecordList: """Keep only the topmost (least specific) match.""" if not records: return SQLRecordList([]) # Group by name records_by_name: dict[str, list[HasType]] = {} for record in records: if record.name not in records_by_name: records_by_name[record.name] = [] records_by_name[record.name].append(record) # Fast path: single match per name result: SQLRecordList = SQLRecordList([]) needs_depth_computation = {} for name, name_records in records_by_name.items(): if len(name_records) == 1: result.append(name_records[0]) else: # Check if any have type_id=None (trivially topmost) root_records = [r for r in name_records if r.type_id is None] if len(root_records) == 1: result.append(root_records[0]) elif len(root_records) > 1: class_name = records[0].__class__.__name__ raise ValidationError( f"Ambiguous match for {class_name} '{name}': found {len(root_records)} " f"root-level {class_name.lower()}s" ) else: # All have type_id, need depth computation needs_depth_computation[name] = name_records # Only compute depths if necessary if needs_depth_computation: def get_depth(record): current_type = record.type depth = 1 while current_type.type_id is not None: current_type = current_type.type depth += 1 return depth for name, name_records in needs_depth_computation.items(): records_with_depth = [(r, get_depth(r)) for r in name_records] min_depth = min(depth for _, depth in records_with_depth) topmost = [r for r, depth in records_with_depth if depth == min_depth] class_name = records[0].__class__.__name__ if len(topmost) > 1: raise ValidationError( f"Ambiguous match for {class_name} '{name}': found {len(topmost)} {class_name.lower()}s " f"at depth {min_depth} (under types: {[r.type.name for r in topmost]})" ) result.append(topmost[0]) return result def _query_ancestors_of_fk(record: SQLRecord, attr: str) -> SQLRecordList: from .query_set import get_default_branch_ids branch_ids = get_default_branch_ids() ancestors = [] current = getattr(record, attr) while current is not None and current.branch_id in branch_ids: ancestors.append(current) current = getattr(current, attr) return SQLRecordList(ancestors) class HasParents: """Base class for hierarchical registries (ontologies).""" def view_parents( self, field: StrField | None = None, with_children: bool = False, distance: int = 5, ): """View parents in an ontology. Args: field: Field to display on graph with_children: Whether to also show children. distance: Maximum distance still shown. Ontological hierarchies: :class:`~lamindb.ULabel` (project & sub-project), :class:`~bionty.CellType` (cell type & subtype). 
Examples: >>> import bionty as bt >>> bt.Tissue.from_source(name="subsegmental bronchus").save() >>> record = bt.Tissue.get(name="respiratory tube") >>> record.view_parents() >>> tissue.view_parents(with_children=True) """ if field is None: field = get_name_field(self) if not isinstance(field, str): field = field.field.name return view_parents( record=self, # type: ignore field=field, with_parents=True, with_children=with_children, distance=distance, ) def view_children( self, field: StrField | None = None, distance: int = 5, ): """View children in an ontology. Args: field: Field to display on graph distance: Maximum distance still shown. Ontological hierarchies: :class:`~lamindb.ULabel` (project & sub-project), :class:`~bionty.CellType` (cell type & subtype). Examples: >>> import bionty as bt >>> bt.Tissue.from_source(name="subsegmental bronchus").save() >>> record = bt.Tissue.get(name="respiratory tube") >>> record.view_parents() >>> tissue.view_parents(with_children=True) """ if field is None: field = get_name_field(self) if not isinstance(field, str): field = field.field.name return view_parents( record=self, # type: ignore field=field, with_parents=False, with_children=True, distance=distance, ) def query_parents(self) -> QuerySet: """Query parents in an ontology.""" return _query_relatives([self], "parents") # type: ignore def query_children(self) -> QuerySet: """Query children in an ontology.""" return _query_relatives([self], "children") # type: ignore def view_digraph(u: Digraph): from graphviz.backend import ExecutableNotFound try: if is_run_from_ipython: from IPython import get_ipython from IPython.display import display # True if the code is running in a Jupyter Notebook or Lab environment if get_ipython().__class__.__name__ == "TerminalInteractiveShell": return u.view() else: # call u._repr_mimebundle_() manually that exception gets raised properly and not just printed by # call to display() display(u._repr_mimebundle_(), raw=True) else: return u.view() except (FileNotFoundError, RuntimeError, ExecutableNotFound): # pragma: no cover logger.error( "please install the graphviz executable on your system:\n - Ubuntu: `sudo" " apt-get install graphviz`\n - Windows:" " https://graphviz.org/download/#windows\n - Mac: `brew install graphviz`" ) def view_lineage( data: Artifact | Collection, with_children: bool = True, return_graph: bool = False ) -> Digraph | None: """View data lineage graph.""" if ln_setup.settings.instance.is_on_hub: instance_slug = ln_setup.settings.instance.slug ui_url = ln_setup.settings.instance.ui_url entity_slug = data.__class__.__name__.lower() logger.important( f"explore at: {ui_url}/{instance_slug}/{entity_slug}/{data.uid}" ) import graphviz df_values = _get_all_parent_runs(data) if with_children: df_values += _get_all_child_runs(data) df_edges = _df_edges_from_runs(df_values) def add_node( record: Run | Artifact | Collection, node_id: str, node_label: str, u: graphviz.Digraph, ): if isinstance(record, Run): fillcolor = TRANSFORM_VIOLET else: fillcolor = "white" u.node( node_id, label=node_label, shape="box", style="rounded,filled", fillcolor=fillcolor, ) u = graphviz.Digraph( f"{data._meta.model_name}_{data.uid}", node_attr={ "fillcolor": "white", "color": "darkgrey", "fontname": "Helvetica", "fontsize": "10", }, edge_attr={"arrowsize": "0.5"}, ) for _, row in df_edges.iterrows(): add_node(row["source_record"], row["source"], row["source_label"], u) if row["target_record"] not in df_edges["source_record"]: add_node(row["target_record"], row["target"], 
row["target_label"], u) u.edge(row["source"], row["target"], color="dimgrey") u.node( f"{data._meta.model_name}_{data.uid}", label=get_record_label(data), style="rounded,filled", fillcolor="white", shape="box", ) if return_graph: return u else: return view_digraph(u) def view_parents( record: SQLRecord, field: str, with_parents: bool = True, with_children: bool = False, distance: int = 100, attr_name: Literal["parents", "predecessors"] = "parents", ): """Graph of parents.""" if not hasattr(record, attr_name): raise NotImplementedError( f"Parents view is not supported for {record.__class__.__name__}!" ) import graphviz import pandas as pd df_edges = None df_edges_parents = None df_edges_children = None if with_parents: df_edges_parents = _df_edges_from_parents( record=record, field=field, distance=distance, attr_name=attr_name ) if with_children: df_edges_children = _df_edges_from_parents( record=record, field=field, distance=distance, children=True, attr_name=attr_name, ) # Rename the columns to swap source and target df_edges_children = df_edges_children.rename( columns={ "source": "temp_target", "source_label": "temp_target_label", "source_record": "temp_target_record", "target": "source", "target_label": "source_label", "target_record": "source_record", } ) df_edges_children = df_edges_children.rename( columns={ "temp_target": "target", "temp_target_label": "target_label", "temp_target_record": "target_record", } ) if df_edges_parents is not None and df_edges_children is not None: df_edges = pd.concat([df_edges_parents, df_edges_children]).drop_duplicates() elif df_edges_parents is not None: df_edges = df_edges_parents elif df_edges_children is not None: df_edges = df_edges_children else: return None u = graphviz.Digraph( record.uid, node_attr={ "color": LAMIN_GREEN_DARKER, "fillcolor": GREEN_FILL, "shape": "box", "style": "rounded,filled", "fontname": "Helvetica", "fontsize": "10", }, edge_attr={"arrowsize": "0.5"}, ) u.node( record.uid, label=(get_record_label(record)), fillcolor=LAMIN_GREEN_LIGHTER, ) if df_edges is not None: for _, row in df_edges.iterrows(): u.node(row["source"], label=row["source_label"]) u.node(row["target"], label=row["target_label"]) u.edge(row["source"], row["target"], color="dimgrey") view_digraph(u) def _get_parents( record: SQLRecord, field: str, distance: int, children: bool = False, attr_name: Literal["parents", "predecessors"] = "parents", ): """Recursively get parent records within a distance.""" if children: key = attr_name else: key = "children" if attr_name == "parents" else "successors" # type: ignore using_db = record._state.db model = record.__class__ condition = f"{key}__{field}" field_value = getattr(record, field) results = model.connect(using_db).filter(**{condition: field_value}) if distance < 2: return results d = 2 while d < distance: # this grows in the loop, # i.e. children__children__name -> children__children__children__name -> ... 
condition = f"{key}__{condition}" records = model.connect(using_db).filter(**{condition: field_value}) try: if not records.exists(): return results results = results | records d += 1 except Exception: # For OperationalError: # SQLite does not support joins containing more than 64 tables return results return results def _df_edges_from_parents( record: SQLRecord, field: str, distance: int, children: bool = False, attr_name: Literal["parents", "predecessors"] = "parents", ): """Construct a DataFrame of edges as the input of graphviz.Digraph.""" if attr_name == "parents": key = "children" if children else "parents" else: key = "successors" if children else "predecessors" parents = _get_parents( record=record, field=field, distance=distance, children=children, attr_name=attr_name, ) using_db = record._state.db all = record.__class__.objects.using(using_db) records = parents | all.filter(id=record.id) df = records.distinct().to_dataframe(include=[f"{key}__id"]) if f"{key}__id" not in df.columns: return None df_edges = df[[f"{key}__id"]] df_edges = df_edges.explode(f"{key}__id") df_edges.index.name = "target" df_edges = df_edges.reset_index() df_edges.dropna(axis=0, inplace=True) df_edges.rename(columns={f"{key}__id": "source"}, inplace=True) df_edges = df_edges.drop_duplicates() # colons messes with the node formatting: # https://graphviz.readthedocs.io/en/stable/node_ports.html df_edges["source_record"] = df_edges["source"].apply(lambda x: all.get(id=x)) df_edges["target_record"] = df_edges["target"].apply(lambda x: all.get(id=x)) if record.__class__.__name__ == "Transform": df_edges["source_label"] = df_edges["source_record"].apply(get_record_label) df_edges["target_label"] = df_edges["target_record"].apply(get_record_label) else: df_edges["source_label"] = df_edges["source_record"].apply( lambda x: get_record_label(x, field) ) df_edges["target_label"] = df_edges["target_record"].apply( lambda x: get_record_label(x, field) ) df_edges["source"] = df_edges["source_record"].apply(lambda x: x.uid) df_edges["target"] = df_edges["target_record"].apply(lambda x: x.uid) return df_edges def get_record_label(record: SQLRecord, field: str | None = None): from .artifact import Artifact from .collection import Collection from .transform import Transform if isinstance(record, (Artifact, Collection, Transform)): title = ( record.key.replace("&", "&") if record.key is not None else record.uid ) return rf"<{title}>" elif isinstance(record, Run): title = record.transform.key.replace("&", "&") if record.entrypoint is not None: title += f": {record.entrypoint}" return ( rf'<{title}
' rf"run at {format_field_value(record.started_at)}>" ) else: if field is None: field = get_name_field(record) title = record.__getattribute__(field) return rf"<{title}>" def _get_all_parent_runs(data: Artifact | Collection) -> list: """Get all input file/collection runs recursively.""" name = data._meta.model_name run_inputs_outputs = [] runs = [data.run] if data.run is not None else [] while len(runs) > 0: inputs = [] for r in runs: inputs_run = ( r.__getattribute__(f"input_{name}s") .all() .filter(branch_id__in=[0, 1]) .to_list() ) if name == "artifact": inputs_run += ( r.input_collections.all().filter(branch_id__in=[0, 1]).to_list() ) outputs_run = ( r.__getattribute__(f"output_{name}s") .all() .filter(branch_id__in=[0, 1]) .to_list() ) if name == "artifact": outputs_run += ( r.output_collections.all().filter(branch_id__in=[0, 1]).to_list() ) # if inputs are outputs artifacts are the same, will result infinite loop # so only show as outputs overlap = set(inputs_run).intersection(outputs_run) if overlap: logger.warning( f"The following artifacts are both inputs and outputs of Run(uid={r.uid}): {overlap}\n → Only showing as outputs." ) inputs_run = list(set(inputs_run) - overlap) if len(inputs_run) > 0: run_inputs_outputs += [(inputs_run, r)] if len(outputs_run) > 0: run_inputs_outputs += [(r, outputs_run)] inputs += inputs_run runs = [f.run for f in inputs if f.run is not None] return run_inputs_outputs def _get_all_child_runs(data: Artifact | Collection) -> list: """Get all output file/collection runs recursively.""" name = data._meta.model_name all_runs: set[Run] = set() run_inputs_outputs = [] if data.run is not None: runs = {f.run for f in data.run.__getattribute__(f"output_{name}s").all()} else: runs = set() if name == "artifact" and data.run is not None: runs.update( { f.run for f in data.run.output_collections.all().filter(branch_id__in=[0, 1]) } ) while runs.difference(all_runs): all_runs.update(runs) child_runs: set[Run] = set() for r in runs: inputs_run = ( r.__getattribute__(f"input_{name}s") .all() .filter(branch_id__in=[0, 1]) .to_list() ) if name == "artifact": inputs_run += ( r.input_collections.all().filter(branch_id__in=[0, 1]).to_list() ) run_inputs_outputs += [(inputs_run, r)] outputs_run = ( r.__getattribute__(f"output_{name}s") .all() .filter(branch_id__in=[0, 1]) .to_list() ) if name == "artifact": outputs_run += ( r.output_collections.all().filter(branch_id__in=[0, 1]).to_list() ) run_inputs_outputs += [(r, outputs_run)] child_runs.update( Run.filter( # type: ignore **{f"input_{name}s__uid__in": [i.uid for i in outputs_run]} ).to_list() ) # for artifacts, also include collections in the lineage if name == "artifact": child_runs.update( Run.filter( # type: ignore input_collections__uid__in=[i.uid for i in outputs_run] ).to_list() ) runs = child_runs return run_inputs_outputs def _df_edges_from_runs(df_values: list): import pandas as pd df = pd.DataFrame(df_values, columns=["source_record", "target_record"]) df = df.explode("source_record") df = df.explode("target_record") df = df.drop_duplicates().dropna() df["source"] = [f"{i._meta.model_name}_{i.uid}" for i in df["source_record"]] df["target"] = [f"{i._meta.model_name}_{i.uid}" for i in df["target_record"]] df["source_label"] = df["source_record"].apply(get_record_label) df["target_label"] = df["target_record"].apply(get_record_label) return df ================================================ FILE: lamindb/models/project.py ================================================ from __future__ import annotations from 
typing import TYPE_CHECKING, overload from django.core.validators import RegexValidator from django.db import models from django.db.models import CASCADE, PROTECT from lamindb.base.fields import ( BigIntegerField, CharField, DateField, DateTimeField, ForeignKey, TextField, URLField, ) from lamindb.base.users import current_user_id from ..base.uids import base62_12 from .artifact import Artifact from .can_curate import CanCurate from .collection import Collection from .feature import Feature from .has_parents import _query_relatives from .record import Record from .run import Run, TracksRun, TracksUpdates, User from .schema import Schema from .sqlrecord import BaseSQLRecord, HasType, IsLink, SQLRecord, ValidateFields from .transform import Transform from .ulabel import ULabel if TYPE_CHECKING: from datetime import date as DateType from datetime import datetime from .block import Block, ProjectBlock from .query_manager import RelatedManager from .query_set import QuerySet from .sqlrecord import Branch class Reference( SQLRecord, HasType, CanCurate, TracksRun, TracksUpdates, ValidateFields ): """References such as internal studies, papers, documents, or URLs. Example: Create a reference object:: reference = Reference( name="A Paper Title", abbr="APT", url="https://doi.org/10.1000/xyz123", pubmed_id=12345678, doi="10.1000/xyz123", description="Good paper.", text="Some text I want to be searchable.", date=date(2023, 11, 21), ).save() """ class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" # also see raw SQL constraints for `is_type` and `type` FK validity in migrations id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, max_length=12, db_index=True, default=base62_12 ) """Universal id, valid across DB instances.""" name: str = CharField(db_index=True) """Title or name of the reference document.""" description: str | None = TextField(null=True) """A description.""" type: Reference | None = ForeignKey( "self", PROTECT, null=True, related_name="references" ) """Type of reference (e.g., 'Study', 'Paper', 'Preprint') ← :attr:`~lamindb.Reference.references`. Allows to group reference by type, e.g., internal studies vs. all papers etc. 
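
    Example (a hedged sketch; the names are hypothetical)::

        paper = Reference(name="Paper", is_type=True).save()      # a reference type
        ref = Reference(name="A Paper Title", type=paper).save()  # a reference of that type
        paper.references.to_dataframe()                           # references of this type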
""" references: RelatedManager[Reference] """References of this type (can only be non-empty if `is_type` is `True`).""" abbr: str | None = CharField( max_length=32, db_index=True, null=True, ) """An abbreviation for the reference.""" url: str | None = URLField(null=True, db_index=True) """URL linking to the reference.""" pubmed_id: int | None = BigIntegerField(null=True, db_index=True) """A PudMmed ID.""" doi: str | None = CharField( null=True, db_index=True, validators=[ RegexValidator( regex=r"^(?:https?://(?:dx\.)?doi\.org/|doi:|DOI:)?10\.\d+/.*$", message="Must be a DOI (e.g., 10.1000/xyz123 or https://doi.org/10.1000/xyz123)", ) ], ) """Digital Object Identifier (DOI) for the reference.""" text: str | None = TextField(null=True) """Abstract or full text of the reference to make it searchable.""" date: DateType | None = DateField(null=True, default=None) """Date of creation or publication of the reference.""" artifacts: RelatedManager[Artifact] = models.ManyToManyField( Artifact, through="ArtifactReference", related_name="references" ) """Annotated artifacts ← :attr:`~lamindb.Artifact.references`.""" transforms: RelatedManager[Transform] = models.ManyToManyField( Transform, through="TransformReference", related_name="references" ) """Annotated transforms ← :attr:`~lamindb.Transform.references`.""" collections: RelatedManager[Collection] = models.ManyToManyField( Collection, through="CollectionReference", related_name="references" ) """Annotated collections ← :attr:`~lamindb.Collection.references`.""" linked_in_records: RelatedManager[Record] = models.ManyToManyField( Record, through="RecordReference", related_name="linked_references" ) """Linked in records ← :attr:`~lamindb.Record.linked_references`.""" records: RelatedManager[Record] = models.ManyToManyField( Record, through="ReferenceRecord", related_name="references" ) """Annotated records ← :attr:`~lamindb.Record.references`.""" projects: RelatedManager[Project] """Projects that annotate this reference ← :attr:`~lamindb.Project.references`.""" @overload def __init__( self, name: str, type: Reference | None = None, is_type: bool = False, abbr: str | None = None, url: str | None = None, pubmed_id: int | None = None, doi: str | None = None, description: str | None = None, text: str | None = None, date: DateType | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def query_references(self) -> QuerySet: """Query references of sub types. While `.references` retrieves the references with the current type, this method also retrieves sub types and the references with sub types of the current type. """ return _query_relatives([self], "references") # type: ignore class Project(SQLRecord, HasType, CanCurate, TracksRun, TracksUpdates, ValidateFields): """Projects to label artifacts, transforms, records, and runs. 
Example: Create a project and annotate an artifact with it:: project = Project( name="My Project Name", abbr="MPN", url="https://example.com/my_project", ).save() artifact.projects.add(project) # <-- labels the artifact with the project ln.track(project=project) # <-- automatically labels entities during the run """ class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" # also see raw SQL constraints for `is_type` and `type` FK validity in migrations id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, max_length=12, db_index=True, default=base62_12 ) """Universal id, valid across DB instances.""" name: str = CharField(db_index=True) """Title or name of the Project.""" description: str | None = TextField(null=True) """A description.""" type: Project | None = ForeignKey( "self", PROTECT, null=True, related_name="projects" ) """Type of project (e.g., 'Program', 'Project', 'GithubIssue', 'Task') ← :attr:`~lamindb.Project.projects`.""" projects: RelatedManager[Project] """Projects of this type (can only be non-empty if `is_type` is `True`).""" abbr: str | None = CharField(max_length=32, db_index=True, null=True) """An abbreviation.""" url: str | None = URLField(max_length=255, null=True, default=None) """A URL.""" start_date: DateType | None = DateField(null=True, default=None) """Date of start of the project.""" end_date: DateType | None = DateField(null=True, default=None) """Date of end of the project.""" parents: RelatedManager[Project] = models.ManyToManyField( "self", symmetrical=False, related_name="children" ) """Parent projects, the super-projects owning this project ← :attr:`~lamindb.Project.children`.""" children: RelatedManager[Project] """Child projects, the sub-projects owned by this project. Reverse accessor for `.parents`. """ predecessors: RelatedManager[Project] = models.ManyToManyField( "self", symmetrical=False, related_name="successors" ) """The preceding projects required by this project ← :attr:`~lamindb.Project.successors`.""" successors: RelatedManager[Project] """The succeeding projects requiring this project. Reverse accessor for `.predecessors`.
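
    Example (a hedged sketch; the names are hypothetical)::

        phase1 = Project(name="Phase 1").save()
        phase2 = Project(name="Phase 2").save()
        phase2.predecessors.add(phase1)
        phase1.successors.to_dataframe()  # now contains "Phase 2"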
""" artifacts: RelatedManager[Artifact] = models.ManyToManyField( Artifact, through="ArtifactProject", related_name="projects" ) """Annotated artifacts ← :attr:`~lamindb.Artifact.projects`.""" transforms: RelatedManager[Transform] = models.ManyToManyField( Transform, through="TransformProject", related_name="projects" ) """Annotated transforms ← :attr:`~lamindb.Transform.projects`.""" runs: RelatedManager[Run] = models.ManyToManyField( Run, through="RunProject", related_name="projects" ) """Annotated runs ← :attr:`~lamindb.Run.projects`.""" ulabels: RelatedManager[ULabel] = models.ManyToManyField( ULabel, through="ULabelProject", related_name="projects" ) """Annotated ulabels ← :attr:`~lamindb.ULabel.projects`.""" features: RelatedManager[Feature] = models.ManyToManyField( Feature, through="FeatureProject", related_name="projects" ) """Annotated features ← :attr:`~lamindb.Feature.projects`.""" schemas: RelatedManager[Schema] = models.ManyToManyField( Schema, through="SchemaProject", related_name="projects" ) """Annotated schemas ← :attr:`~lamindb.Schema.projects`.""" linked_in_records: RelatedManager[Record] = models.ManyToManyField( Record, through="RecordProject", related_name="linked_projects" ) """Linked in records ← :attr:`~lamindb.Record.linked_projects`.""" records: RelatedManager[Record] = models.ManyToManyField( Record, through="ProjectRecord", related_name="projects" ) """Annotated records ← :attr:`~lamindb.Record.projects`.""" collections: RelatedManager[Collection] = models.ManyToManyField( Collection, through="CollectionProject", related_name="projects" ) """Annotated collections ← :attr:`~lamindb.Collection.projects`.""" references: RelatedManager[Reference] = models.ManyToManyField( "Reference", related_name="projects" ) """Annotated references ← :attr:`~lamindb.Reference.projects`.""" blocks: RelatedManager[Block] = models.ManyToManyField( "Block", through="BlockProject", related_name="projects" ) """Annotated blocks ← :attr:`~lamindb.Block.projects`.""" users: RelatedManager[User] = models.ManyToManyField( "User", through="ProjectUser", related_name="projects", ) """Users participating in this project ← :attr:`~lamindb.ProjectUser.user`.""" branches: RelatedManager[Branch] """Annotated branches ← :attr:`~lamindb.BranchProject.project`.""" _status_code: int = models.SmallIntegerField(default=0, db_default=0, db_index=True) """Status code.""" ablocks: RelatedManager[ProjectBlock] """Attached blocks ← :attr:`~lamindb.ProjectBlock.project`.""" @overload def __init__( self, name: str, type: Project | None = None, is_type: bool = False, abbr: str | None = None, url: str | None = None, start_date: DateType | None = None, end_date: DateType | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def query_projects(self) -> QuerySet: """Query projects of sub types. While `.projects` retrieves the projects with the current type, this method also retrieves sub types and the projects with sub types of the current type. 
""" return _query_relatives([self], "projects") # type: ignore class ArtifactProject(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_artifact") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, default=None, related_name="links_artifactproject", ) class Meta: app_label = "lamindb" # can have the same label linked to the same artifact if the feature is different unique_together = ("artifact", "project", "feature") class RunProject(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) run: Run = ForeignKey(Run, CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_run") created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" created_by: User = ForeignKey( "lamindb.User", PROTECT, editable=False, default=current_user_id, related_name="+", ) """Creator of record.""" class Meta: app_label = "lamindb" unique_together = ("run", "project") class BranchProject(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) branch: Branch = ForeignKey("Branch", CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_branch") class Meta: app_label = "lamindb" unique_together = ("branch", "project") class ProjectUser(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) project: Project = ForeignKey(Project, CASCADE, related_name="links_user") user: User = ForeignKey("User", PROTECT, related_name="links_project") role: str = CharField(max_length=32, db_index=True) """Role (e.g. 
"responsible", "viewer").""" class Meta: app_label = "lamindb" unique_together = ("project", "user", "role") class TransformProject(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) transform: Transform = ForeignKey(Transform, CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_transform") class Meta: app_label = "lamindb" unique_together = ("transform", "project") class CollectionProject(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) collection: Collection = ForeignKey( Collection, CASCADE, related_name="links_project" ) project: Project = ForeignKey(Project, PROTECT, related_name="links_collection") class Meta: app_label = "lamindb" unique_together = ("collection", "project") class ULabelProject(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) ulabel: ULabel = ForeignKey(ULabel, CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_ulabel") class Meta: app_label = "lamindb" unique_together = ("ulabel", "project") class FeatureProject(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) feature: Feature = ForeignKey(Feature, CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_feature") class Meta: app_label = "lamindb" unique_together = ("feature", "project") class SchemaProject(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) schema: Schema = ForeignKey(Schema, CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_schema") class Meta: app_label = "lamindb" unique_together = ("schema", "project") # for annotation of records with references, RecordReference is for storing reference values class ReferenceRecord(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) reference: Reference = ForeignKey(Reference, PROTECT, related_name="links_record") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, default=None, related_name="links_referencerecord", ) record: Record = ForeignKey(Record, CASCADE, related_name="links_reference") class Meta: app_label = "lamindb" unique_together = ("reference", "feature", "record") class RecordReference(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_reference") feature: Feature = ForeignKey( Feature, PROTECT, related_name="links_recordreference" ) value: Reference = ForeignKey(Reference, PROTECT, related_name="links_in_record") class Meta: app_label = "lamindb" unique_together = ("record", "feature", "value") # for annotation of records with projects, RecordProject is for storing project values class ProjectRecord(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) project: Project = ForeignKey(Project, PROTECT, related_name="links_record") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, default=None, related_name="links_projectrecord", ) record: Record = ForeignKey(Record, CASCADE, related_name="links_project") class Meta: app_label = "lamindb" unique_together = ("project", "feature", "record") class RecordProject(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_project") feature: Feature = ForeignKey(Feature, 
PROTECT, related_name="links_recordproject") value: Project = ForeignKey(Project, PROTECT, related_name="links_in_record") class Meta: app_label = "lamindb" unique_together = ("record", "feature", "value") class BlockProject(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) block = ForeignKey("Block", CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_block") class Meta: app_label = "lamindb" unique_together = ("block", "project") class ArtifactReference(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_reference") reference: Reference = ForeignKey(Reference, PROTECT, related_name="links_artifact") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, default=None, related_name="links_artifactreference", ) class Meta: app_label = "lamindb" unique_together = ("artifact", "reference", "feature") class TransformReference(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) transform: Transform = ForeignKey( Transform, CASCADE, related_name="links_reference" ) reference: Reference = ForeignKey( Reference, PROTECT, related_name="links_transform" ) class Meta: app_label = "lamindb" unique_together = ("transform", "reference") class CollectionReference(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) collection: Collection = ForeignKey( Collection, CASCADE, related_name="links_reference" ) reference: Reference = ForeignKey( Reference, PROTECT, related_name="links_collection" ) class Meta: app_label = "lamindb" unique_together = ("collection", "reference") ================================================ FILE: lamindb/models/query_manager.py ================================================ from __future__ import annotations import re from functools import reduce from typing import TYPE_CHECKING, Literal, NamedTuple from django.db.models import ( IntegerField, Manager, Q, QuerySet, TextField, Value, ) from django.db.models.functions import Cast, Coalesce from django.db.models.lookups import ( Contains, Exact, IContains, IExact, IRegex, IStartsWith, Regex, StartsWith, ) from lamin_utils._lookup import Lookup from lamindb_setup.core import deprecated from lamindb_setup.core._docs import doc_args if TYPE_CHECKING: from ..base.types import StrField def _search( cls, string: str, *, field: StrField | list[StrField] | None = None, limit: int | None = 20, case_sensitive: bool = False, truncate_string: bool = False, ) -> QuerySet: """Search. Args: string: The input string to match against the field ontology values. field: The field or fields to search. Search all string fields by default. limit: Maximum amount of top results to return. case_sensitive: Whether the match is case sensitive. Returns: A sorted `DataFrame` of search results with a score in column `score`. If `return_queryset` is `True`. `QuerySet`. See Also: :meth:`~lamindb.models.SQLRecord.filter` :meth:`~lamindb.models.SQLRecord.lookup` Examples: :: records = ln.ULabel.from_values(["Label1", "Label2", "Label3"]).save() ln.ULabel.search("Label2") """ if string is None: raise ValueError("Cannot search for None value! 
Please pass a valid string.") input_queryset = ( cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.objects.all() ) registry = input_queryset.model name_field = getattr(registry, "_name_field", "name") if field is None: fields = [ field.name for field in registry._meta.fields if field.get_internal_type() in {"CharField", "TextField"} ] else: if not isinstance(field, list): fields_input = [field] else: fields_input = field fields = [] for field in fields_input: if not isinstance(field, str): try: fields.append(field.field.name) except AttributeError as error: raise TypeError( "Please pass a SQLRecord string field, e.g., `CellType.name`!" ) from error else: fields.append(field) if truncate_string: if (len_string := len(string)) > 5: n_80_pct = int(len_string * 0.8) string = string[:n_80_pct] string = string.strip() string_escape = re.escape(string) exact_lookup = Exact if case_sensitive else IExact regex_lookup = Regex if case_sensitive else IRegex contains_lookup = Contains if case_sensitive else IContains ranks = [] contains_filters = [] for field in fields: field_expr = Coalesce( Cast(field, output_field=TextField()), Value(""), output_field=TextField(), ) # exact rank exact_expr = exact_lookup(field_expr, string) exact_rank = Cast(exact_expr, output_field=IntegerField()) * 200 ranks.append(exact_rank) # exact synonym synonym_expr = regex_lookup(field_expr, rf"(?:^|.*\|){string_escape}(?:\|.*|$)") synonym_rank = Cast(synonym_expr, output_field=IntegerField()) * 200 ranks.append(synonym_rank) # match as sub-phrase sub_expr = regex_lookup( field_expr, rf"(?:^|.*[ \|\.,;:]){string_escape}(?:[ \|\.,;:].*|$)" ) sub_rank = Cast(sub_expr, output_field=IntegerField()) * 10 ranks.append(sub_rank) # startswith and avoid matching string with " " on the right # mostly for truncated startswith_expr = regex_lookup( field_expr, rf"(?:^|.*\|){string_escape}[^ ]*(?:\|.*|$)" ) startswith_rank = Cast(startswith_expr, output_field=IntegerField()) * 8 ranks.append(startswith_rank) # match as sub-phrase from the left, mostly for truncated right_expr = regex_lookup(field_expr, rf"(?:^|.*[ \|]){string_escape}.*") right_rank = Cast(right_expr, output_field=IntegerField()) * 2 ranks.append(right_rank) # match as sub-phrase from the right left_expr = regex_lookup(field_expr, rf".*{string_escape}(?:$|[ \|\.,;:].*)") left_rank = Cast(left_expr, output_field=IntegerField()) * 2 ranks.append(left_rank) # simple contains filter contains_expr = contains_lookup(field_expr, string) contains_filter = Q(contains_expr) contains_filters.append(contains_filter) # also rank by contains contains_rank = Cast(contains_expr, output_field=IntegerField()) ranks.append(contains_rank) # additional rule for truncated strings # weight matches from the beginning of the string higher # sometimes whole words get truncated and startswith_expr is not enough if truncate_string and field == name_field: startswith_lookup = StartsWith if case_sensitive else IStartsWith name_startswith_expr = startswith_lookup(field_expr, string) name_startswith_rank = ( Cast(name_startswith_expr, output_field=IntegerField()) * 2 ) ranks.append(name_startswith_rank) ranked_queryset = ( input_queryset.filter(reduce(lambda a, b: a | b, contains_filters)) .alias(rank=sum(ranks)) .order_by("-rank") ) return ranked_queryset[:limit] def _lookup( cls, field: StrField | None = None, return_field: StrField | None = None, using_key: str | None = None, keep: Literal["first", "last", False] = "first", ) -> NamedTuple: """Return an auto-complete object for a field. 
Args: field: The field to look up the values for. Defaults to first string field. return_field: The field to return. If `None`, returns the whole record. keep: When multiple records are found for a lookup, how to return the records. - `"first"`: return the first record. - `"last"`: return the last record. - `False`: return all records. Returns: A `NamedTuple` of lookup information of the field values with a dictionary converter. See Also: :meth:`~lamindb.models.SQLRecord.search` Examples: Lookup via auto-complete on `.`:: import bionty as bt bt.Gene.from_source(symbol="ADGB-DT").save() lookup = bt.Gene.lookup() lookup.adgb_dt Look up via auto-complete in dictionary:: lookup_dict = lookup.dict() lookup_dict['ADGB-DT'] Look up via a specific field:: lookup_by_ensembl_id = bt.Gene.lookup(field="ensembl_gene_id") genes.ensg00000002745 Return a specific field value instead of the full record:: lookup_return_symbols = bt.Gene.lookup(field="ensembl_gene_id", return_field="symbol") """ from .sqlrecord import get_name_field queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.objects.all() field = get_name_field(registry=queryset.model, field=field) return Lookup( records=queryset, values=[i.get(field) for i in queryset.values()], tuple_name=cls.__class__.__name__, prefix="ln", keep=keep, ).lookup( return_field=( get_name_field(registry=queryset.model, field=return_field) if return_field is not None else None ) ) # this is the default (._default_manager and ._base_manager) for lamindb models class QueryManager(Manager): """Manage queries through fields. See Also: :class:`lamindb.models.QuerySet` `django Manager `__ Examples: Populate the `.parents` ManyToMany relationship (a `QueryManager`):: ln.ULabel.from_values(["Label1", "Label2", "Label3"]).save() labels = ln.ULabel.filter(name__icontains="label") label1 = ln.ULabel.get(name="Label1") label1.parents.set(labels) Convert all linked parents to a `DataFrame`:: label1.parents.to_dataframe() """ def to_list(self, field: str | None = None): """Populate a list.""" if field is None: return list(self.all()) else: return list(self.values_list(field, flat=True)) def to_dataframe(self, **kwargs): """Convert to DataFrame. For `**kwargs`, see :meth:`lamindb.models.QuerySet.to_dataframe`. """ return self.all().to_dataframe(**kwargs) @deprecated(new_name="to_dataframe") def df(self, **kwargs): return self.to_dataframe(**kwargs) @doc_args(_search.__doc__) def search(self, string: str, **kwargs): """{}""" # noqa: D415 return _search(cls=self.all(), string=string, **kwargs) @doc_args(_lookup.__doc__) def lookup(self, field: StrField | None = None, **kwargs) -> NamedTuple: """{}""" # noqa: D415 return _lookup(cls=self.all(), field=field, **kwargs) def get_queryset(self): from .query_set import BasicQuerySet # QueryManager returns BasicQuerySet because it is problematic to redefine .filter and .get # for a query set used by the default manager return BasicQuerySet(model=self.model, using=self._db, hints=self._hints) # below is just for typing / docs # Django achieves the same thing with a dynamically generated class class RelatedManager(QueryManager): """Manager for many-to-many and reverse foreign key relationships. Provides relationship manipulation methods. 
See Also: :class:`lamindb.models.QueryManager` Examples: Populate the `.parents` ManyToMany relationship (a `RelatedManager`):: ln.ULabel.from_values(["Label1", "Label2", "Label3"]).save() labels = ln.ULabel.filter(name__icontains="label") label1 = ln.ULabel.get(name="Label1") label1.parents.set(labels) Convert all linked parents to a `DataFrame`:: label1.parents.to_dataframe() Remove a parent label:: label1.parents.remove(label2) Clear all parent labels:: label1.parents.clear() """ def add(self, *objs, bulk: bool = True) -> None: """Add objects to the relationship.""" ... def set(self, objs, *, bulk: bool = True, clear: bool = False) -> None: """Set the relationship to the specified objects.""" ... def remove(self, *objs, bulk: bool = True) -> None: """Remove objects from the relationship.""" ... def clear(self) -> None: """Remove all objects from the relationship.""" ... ================================================ FILE: lamindb/models/query_set.py ================================================ from __future__ import annotations import ast import re import warnings from collections import UserList, defaultdict from collections.abc import Iterable from collections.abc import Iterable as IterableType from importlib import import_module from typing import TYPE_CHECKING, Any, Generic, NamedTuple, TypeVar, final import lamindb_setup as ln_setup from django.core.exceptions import FieldError from django.db import models from django.db.models import ( F, FilteredRelation, ForeignKey, ManyToManyField, Q, Subquery, ) from django.db.models.fields.related import ForeignObjectRel from lamin_utils import logger from lamindb_setup import settings as setup_settings from lamindb_setup.core import deprecated from lamindb_setup.core._docs import doc_args from ..base.types import BRANCH_STATUS_TO_CODE, RUN_STATUS_TO_CODE from ..errors import DoesNotExist, MultipleResultsFound from ._is_versioned import IsVersioned, _adjust_is_latest_when_deleting_is_versioned from .can_curate import CanCurate, _inspect, _standardize, _validate from .query_manager import _lookup, _search from .sqlrecord import Registry, SQLRecord if TYPE_CHECKING: import pandas as pd from bionty.models import ( CellLine, CellMarker, CellType, DevelopmentalStage, Disease, Ethnicity, ExperimentalFactor, Gene, Organism, Pathway, Phenotype, Protein, Tissue, ) from pertdb.models import ( Biologic, CombinationPerturbation, Compound, CompoundPerturbation, EnvironmentalPerturbation, GeneticPerturbation, PerturbationTarget, ) from lamindb.base.types import ListLike, StrField from lamindb.models import ( Artifact, Branch, Collection, Feature, Project, Record, Reference, Run, Schema, Space, Storage, Transform, ULabel, User, ) T = TypeVar("T") def get_keys_from_df(data: list, registry: SQLRecord) -> list[str]: if len(data) > 0: if isinstance(data[0], dict): keys = list(data[0].keys()) else: keys = list(data[0].__dict__.keys()) if "_state" in keys: keys.remove("_state") else: keys = [ field.name for field in registry._meta.fields if not isinstance(field, models.ForeignKey) ] keys += [ f"{field.name}_id" for field in registry._meta.fields if isinstance(field, models.ForeignKey) ] return keys def get_default_branch_ids(branch: Branch | None = None) -> list[int]: """Return branch IDs to include in default queries. By default, queries include records on the main branch (branch_id=1) but exclude trashed (branch_id=-1) and archived records (branch_id=0). This matches behavior of familiar tools like GitHub, Slack, and email clients. 
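For example, on the default setup this returns `[1]`, while on a branch with id `7` it returns `[7, 1]`.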
If a user switches to another branch via `lamin switch branch`, the main branch will still be included. Returns: List containing the default branch and current branch if different. """ if branch is None: branch_id = setup_settings.branch.id else: branch_id = branch.id branch_ids = [branch_id] if branch_id != 1: # add the main branch by default branch_ids.append(1) return branch_ids def one_helper( self: QuerySet | SQLRecordList, does_not_exist_msg: str | None = None, raise_doesnotexist: bool = True, not_exists: bool | None = None, raise_multipleresultsfound: bool = True, ): if not_exists is None: if isinstance(self, SQLRecordList): not_exists = len(self) == 0 else: not_exists = not self.exists() # type: ignore if not_exists: if raise_doesnotexist: raise DoesNotExist(does_not_exist_msg) else: return None elif len(self) > 1: if raise_multipleresultsfound: raise MultipleResultsFound(self) else: return self[0] else: return self[0] def get_backward_compat_filter_kwargs(queryset, expressions): from lamindb.models import ( Artifact, Branch, Feature, Project, Run, ) if issubclass(queryset.model, IsVersioned): name_mappings = { "version": "version_tag", } else: name_mappings = {} if queryset.model is Artifact: name_mappings.update( { "transform": "run__transform", "feature_sets": "schemas", } ) if queryset.model is Feature: name_mappings.update( { "dtype": "_dtype_str", "dtype_as_str": "_dtype_str", } ) if queryset.model in {Run, Branch, Project}: name_mappings.update( { "status": "_status_code", } ) # If no mappings to apply, return expressions as-is if not name_mappings: return expressions was_list = False if isinstance(expressions, list): was_list = True expressions = {field: True for field in expressions} mapped = {} status_mapping = None if queryset.model is Run: status_mapping = RUN_STATUS_TO_CODE elif queryset.model is Branch: status_mapping = BRANCH_STATUS_TO_CODE def _map_status_value(value): if status_mapping is None: return value if isinstance(value, str): if value not in status_mapping: expected = ", ".join(f"'{status}'" for status in status_mapping) raise ValueError( f"Invalid {queryset.model.__name__} status '{value}'. " f"Expected one of: {expected}." ) return status_mapping[value] if isinstance(value, IterableType) and not isinstance(value, str): return [ status_mapping[v] if isinstance(v, str) and v in status_mapping else v for v in value ] return value for field, value in expressions.items(): parts = field.split("__") if parts[0] in name_mappings: # Issue deprecation warnings if queryset.model is Artifact and parts[0] == "feature_sets": warnings.warn( "Querying Artifact by `feature_sets` is deprecated. Use `schemas` instead.", DeprecationWarning, stacklevel=4, ) elif queryset.model is Feature and parts[0] == "dtype": warnings.warn( "Querying Feature by `dtype` is deprecated. Use `dtype_as_str` instead. 
" "Notice the new dtype encoding format for Record and ULabel subtypes.", DeprecationWarning, stacklevel=4, ) new_field = name_mappings[parts[0]] + ( "__" + "__".join(parts[1:]) if len(parts) > 1 else "" ) mapped[new_field] = ( _map_status_value(value) if parts[0] == "status" else value ) else: mapped[field] = value return list(mapped.keys()) if was_list else mapped def process_expressions(queryset: QuerySet, queries: tuple, expressions: dict) -> dict: def _map_databases(value: Any, key: str, target_db: str) -> tuple[str, Any]: if isinstance(value, SQLRecord): if value._state.db != target_db: logger.warning( f"passing record from database {value._state.db} to query {target_db}, matching on uid '{value.uid}'" ) return f"{key}__uid", value.uid return key, value if ( key.endswith("__in") and isinstance(value, IterableType) and not isinstance(value, str) ): if any( isinstance(v, SQLRecord) and v._state.db != target_db for v in value ): logger.warning( f"passing records from another database to query {target_db}, matching on uids" ) return key.replace("__in", "__uid__in"), [ v.uid if isinstance(v, SQLRecord) else v for v in value ] return key, value return key, value branch_fields = {"branch", "branch_id"} branch_prefixes = ("branch__", "branch_id__") def queries_contain_branch(queries: tuple) -> bool: """Check if any Q object in queries references branch or branch_id.""" def check_q_object(q: Q) -> bool: # Q objects store their conditions in q.children for child in q.children: if isinstance(child, tuple) and len(child) == 2: # Normal condition: (key, value) key = child[0] if key in branch_fields or key.startswith(branch_prefixes): return True elif isinstance(child, Q): # Nested Q object if check_q_object(child): return True return False return any(check_q_object(q) for q in queries if isinstance(q, Q)) expressions = get_backward_compat_filter_kwargs( queryset, expressions, ) model_has_branch = any( field.name == "branch" for field in queryset.model._meta.concrete_fields ) if issubclass(queryset.model, SQLRecord) or model_has_branch: # branch_id is set to 1 unless expressions contains id, uid or hash id_uid_hash = {"id", "uid", "hash", "id__in", "uid__in", "hash__in"} if not any(expression in id_uid_hash for expression in expressions): expressions_have_branch = False for expression in expressions: if expression in branch_fields or expression.startswith( branch_prefixes ): expressions_have_branch = True break if not expressions_have_branch and not queries_contain_branch(queries): expressions["branch_id__in"] = get_default_branch_ids() else: # if branch_id is None, do not apply a filter # otherwise, it would mean filtering for NULL values, which doesn't make # sense for a non-NULLABLE column if "branch_id" in expressions and expressions["branch_id"] is None: expressions.pop("branch_id") if "branch" in expressions and expressions["branch"] is None: expressions.pop("branch") if queryset._db is not None: # only check for database mismatch if there is a defined database on the # queryset return dict( ( _map_databases(value, key, queryset._db) for key, value in expressions.items() ) ) else: return expressions def get( registry_or_queryset: Registry | BasicQuerySet, idlike: int | str | None = None, **expressions, ) -> SQLRecord: if isinstance(registry_or_queryset, BasicQuerySet): # not QuerySet but only BasicQuerySet assert not isinstance(registry_or_queryset, QuerySet) # noqa: S101 qs = registry_or_queryset registry = qs.model else: qs = BasicQuerySet(model=registry_or_queryset) registry = 
registry_or_queryset if isinstance(idlike, int): return qs.get(id=idlike) elif isinstance(idlike, str): NAME_FIELD = ( registry._name_field if hasattr(registry, "_name_field") else "name" ) DOESNOTEXIST_MSG = f"No record found with uid '{idlike}'. Did you forget a keyword as in {registry.__name__}.get({NAME_FIELD}='{idlike}')?" # this is the case in which the user passes an under-specified uid if issubclass(registry, IsVersioned) and len(idlike) <= registry._len_stem_uid: new_qs = qs.filter(uid__startswith=idlike, is_latest=True) not_exists = None if not new_qs.exists(): # also try is_latest is False due to nothing found new_qs = qs.filter(uid__startswith=idlike, is_latest=False) else: not_exists = False # it doesn't make sense to raise MultipleResultsFound when querying with an # underspecified uid return one_helper( new_qs, DOESNOTEXIST_MSG, not_exists=not_exists, raise_multipleresultsfound=False, ) else: qs = qs.filter(uid__startswith=idlike) return one_helper(qs, DOESNOTEXIST_MSG) else: assert idlike is None # noqa: S101 expressions = process_expressions(qs, [], expressions) # inject is_latest for consistency with idlike is_latest_was_not_in_expressions = "is_latest" not in expressions if issubclass(registry, IsVersioned) and is_latest_was_not_in_expressions: expressions["is_latest"] = True try: return qs.get(**expressions) except registry.DoesNotExist as e: # handle the case in which the is_latest injection led to a missed query if "is_latest" in expressions and is_latest_was_not_in_expressions: expressions.pop("is_latest") result = qs.filter(**expressions).order_by("-created_at").first() if result is not None: return result raise e class SQLRecordList(UserList, Generic[T]): """Is ordered, can't be queried, but has `.to_dataframe()`.""" def __init__(self, records: Iterable[T]): if isinstance(records, list): self.data = records # Direct assignment if already a list, no copy else: super().__init__(records) # Let UserList handle the conversion def to_dataframe(self) -> pd.DataFrame: import pandas as pd keys = get_keys_from_df(self.data, self.data[0].__class__) values = [record.__dict__ for record in self.data] return pd.DataFrame(values, columns=keys) @deprecated(new_name="to_dataframe") def df(self) -> pd.DataFrame: return self.to_dataframe() def to_list( self, field: str | None = None ) -> list[str]: # meaningful to be parallel with to_list() in QuerySet if field is None: return self.data return [getattr(record, field) for record in self.data] def one(self) -> T: """Exactly one result. 
Throws error if there are more or none.""" return one_helper(self) def save(self) -> SQLRecordList[T]: """Save all records to the database.""" from lamindb.models.save import save save(self) return self def get_basic_field_names( qs: QuerySet, include: list[str], features_input: bool | list[str] | str, ) -> list[str]: exclude_field_names = ["updated_at"] include_private_fields = False if "privates" in include: include_private_fields = True include.remove("privates") field_names = [ field.name for field in qs.model._meta.fields if ( not isinstance(field, models.ForeignKey) and field.name not in exclude_field_names and ( not field.name.startswith("_") or include_private_fields or (field.name == "_dtype_str" and qs.model.__name__ == "Feature") ) ) ] # TODO: harmonize with L1023 in sqlrecord.py for field_name in [ "version_tag", "is_latest", "is_locked", "is_type", "created_at", "updated_at", "created_on", ]: if field_name in field_names: field_names.append(field_names.pop(field_names.index(field_name))) field_names += [ f"{field.name}_id" for field in qs.model._meta.fields if isinstance(field, models.ForeignKey) ] # move uid to first position if present if "uid" in field_names: field_names.insert(0, field_names.pop(field_names.index("uid"))) # move primary key to second position if present pk = qs.model._meta.pk.name if qs.model._meta.pk else None if pk and pk in field_names: field_names.insert(1, field_names.pop(field_names.index(pk))) if ( include or features_input ): # if there is features_input, reduce fields to just the first 3 subset_field_names = field_names[:3] intersection = set(field_names) & set(include) subset_field_names += list(intersection) field_names = subset_field_names return field_names def get_feature_annotate_kwargs( registry: Registry, features: bool | list[str] | str | None, qs: QuerySet | None = None, ) -> tuple[dict[str, Any], QuerySet, dict[str, Any]]: from lamindb.models import ( Artifact, Feature, Record, RecordJson, Run, ULabel, ) from lamindb.models.feature import parse_dtype if registry not in {Artifact, Record, Run}: raise ValueError( f'include="features" is only applicable for Artifact, Record, and Run, not {registry.__name__}' ) feature_ids = [] if features == "queryset": ids_list = qs.values_list("id", flat=True) for obj in registry._meta.related_objects: related_name_attr = getattr(registry, obj.related_name, None) if related_name_attr is None or not hasattr(related_name_attr, "through"): continue link_model = related_name_attr.through if ( not hasattr(link_model, "feature") or link_model.__name__ == "Record_parents" ): continue filter_field = registry.__name__.lower() if not hasattr(link_model, filter_field): potential_fields = [] for field in link_model._meta.get_fields(): if field.is_relation and field.related_model is registry: potential_fields.append(field.name) if len(potential_fields) == 1: filter_field = potential_fields[0] else: continue links = link_model.objects.using(qs.db).filter( **{filter_field + "_id__in": ids_list} ) feature_ids_for_link_model = links.values_list("feature__id", flat=True) feature_ids += feature_ids_for_link_model if registry is Record: # this request is not strictly necessary, but it makes the resulting reshaped # dataframe consistent feature_ids += RecordJson.filter(record_id__in=ids_list).values_list( "feature__id", flat=True ) feature_ids = list(set(feature_ids)) # remove duplicates feature_qs = Feature.connect(None if qs is None else qs.db).filter( _dtype_str__isnull=False ) if isinstance(features, list): feature_qs = 
feature_qs.filter(name__in=features) if len(features) != feature_qs.count(): logger.warning( f"found features and passed features differ:\n - passed: {features}\n - found: {feature_qs.to_list('name')}" ) elif feature_ids: feature_qs = feature_qs.filter(id__in=feature_ids) else: feature_qs = feature_qs.filter( ~Q(_dtype_str__startswith="cat[") | Q(_dtype_str__startswith="cat[ULabel") | Q(_dtype_str__startswith="cat[Record") ) logger.important( f"queried for all categorical features of dtypes Record or ULabel and non-categorical features: ({len(feature_qs)}) {feature_qs.to_list('name')}" ) # Duplicate feature names map to ambiguous dataframe columns. We keep a single # feature per name for query annotation and warn loudly to surface this. feature_name_to_ids: dict[str, list[int]] = defaultdict(list) for feature in feature_qs.order_by("id"): feature_name_to_ids[feature.name].append(feature.id) duplicate_feature_names = { name: ids for name, ids in feature_name_to_ids.items() if len(ids) > 1 } if duplicate_feature_names: logger.warning( "detected duplicate feature names while building dataframe features; " "keeping the first feature per name by ascending id. " f"duplicates: {duplicate_feature_names}" ) unique_feature_ids = [ids[0] for ids in feature_name_to_ids.values()] feature_qs = feature_qs.filter(id__in=unique_feature_ids) # Get the categorical features cat_feature_types = { parse_dtype(feature._dtype_str)[0]["registry_str"] for feature in feature_qs if feature._dtype_str.startswith("cat[") or feature._dtype_str.startswith("list[cat[") } # fields to annotate cat_feature_fields = defaultdict(list) for feature in feature_qs: dtype_str = feature._dtype_str if dtype_str.startswith("cat[") or dtype_str.startswith("list[cat["): dtype_info = parse_dtype(dtype_str)[0] registry_str = dtype_info["registry_str"] field_name = dtype_info["field_str"] cat_feature_fields[registry_str].append(field_name) # Get relationships of labels and features link_models_on_models = { getattr( registry, obj.related_name ).through.__get_name_with_module__(): obj.related_model for obj in registry._meta.related_objects if obj.related_model.__get_name_with_module__() in cat_feature_types and hasattr(getattr(registry, obj.related_name), "through") and hasattr(getattr(registry, obj.related_name).through, "feature_id") } if registry is Artifact: link_models_on_models["ArtifactULabel"] = ULabel elif registry is Record: link_models_on_models["RecordRecord"] = Record link_attributes_on_models = { obj.related_name: link_models_on_models[ obj.related_model.__get_name_with_module__() ] for obj in registry._meta.related_objects if ( obj.related_model.__get_name_with_module__() in link_models_on_models and ( not obj.related_name.startswith("links_record") if registry is Record else True ) ) } # Prepare Django's annotate for features with filtering filtered_relations = {} annotate_kwargs = {} for link_attr, feature_type_model in link_attributes_on_models.items(): feature_type = feature_type_model.__get_name_with_module__() if link_attr == "links_project" and registry is Record: # we're only interested in _values_project when "annotating" records continue # Determine field name if registry in {Artifact, Run}: field_name = ( feature_type.split(".")[1] if "." 
in feature_type else feature_type ).lower() else: field_name = "value" # Determine if this value model needs branch filtering # Skip user relations (RecordUser, ArtifactUser don't have branch) should_filter_branch = link_attr not in {"values_user", "links_user"} # Create filtered relation for the value model value_relation_path = f"{link_attr}__{field_name}" filtered_value_relation_name = f"filtered_{link_attr}_{field_name}" if should_filter_branch: filtered_relations[filtered_value_relation_name] = FilteredRelation( value_relation_path, condition=Q( **{ f"{value_relation_path}__branch_id__in": get_default_branch_ids() } ), ) else: # No branch filtering needed filtered_relations[filtered_value_relation_name] = FilteredRelation( value_relation_path ) # Add annotation for feature name (feature doesn't have branch_id) annotate_kwargs[f"{link_attr}__feature__name"] = F( f"{link_attr}__feature__name" ) # Add annotations for categorical feature fields using the filtered relation for field in cat_feature_fields[feature_type]: annotate_kwargs[f"{link_attr}__{field_name}__{field}"] = F( f"{filtered_value_relation_name}__{field}" ) # Handle JSON values (no branch filtering needed) json_values_attribute = ( "json_values" if registry in {Artifact, Run} else "values_json" ) annotate_kwargs[f"{json_values_attribute}__feature__name"] = F( f"{json_values_attribute}__feature__name" ) annotate_kwargs[f"{json_values_attribute}__value"] = F( f"{json_values_attribute}__value" ) return annotate_kwargs, feature_qs, filtered_relations # https://claude.ai/share/16280046-6ae5-4f6a-99ac-dec01813dc3c def analyze_lookup_cardinality( model_class: SQLRecord, lookup_paths: list[str] | None ) -> dict[str, str]: """Analyze lookup cardinality. Analyzes Django model lookups to determine if they will result in one-to-one or one-to-many relationships when used in annotations. Args: model_class: The Django model class to analyze include: List of lookup paths (e.g. 
["created_by__name", "ulabels__name"]) Returns: Dictionary mapping lookup paths to either 'one' or 'many' """ result = {} # type: ignore if lookup_paths is None: return result for lookup_path in lookup_paths: parts = lookup_path.split("__") current_model = model_class is_many = False # Walk through each part of the lookup path for part in parts[:-1]: # Exclude the last part as it's an attribute field = None # Handle reverse relations for f in current_model._meta.get_fields(): if isinstance(f, ForeignObjectRel) and f.get_accessor_name() == part: field = f is_many = not f.one_to_one if hasattr(f, "field"): current_model = f.field.model break # Handle forward relations if field is None: field = current_model._meta.get_field(part) if isinstance(field, ManyToManyField): is_many = True current_model = field.remote_field.model elif isinstance(field, ForeignKey): current_model = field.remote_field.model result[lookup_path] = "many" if is_many else "one" return result def reorder_subset_columns_in_df( df: pd.DataFrame, column_order: list[str], position=3 ) -> pd.DataFrame: """Reorder subset of columns in dataframe to specified position.""" valid_columns = [col for col in column_order if col in df.columns] all_cols = df.columns.tolist() remaining_cols = [col for col in all_cols if col not in valid_columns] new_order = remaining_cols[:position] + valid_columns + remaining_cols[position:] return df[new_order] def encode_lamindb_fields_as_columns( registry: Registry, fields: str | list[str] ) -> str | dict[str, str]: """Encode laminDB specific fields in dataframe with __lamindb_{model_name}_{field_name}__. This is needed when reshaping dataframes with features to avoid conflicts between laminDB fields and feature names. """ def encode(field: str) -> str: return f"__lamindb_{registry._meta.model_name}_{field}__" registry_field_names = {field.name for field in registry._meta.concrete_fields} if isinstance(fields, str): return encode(fields) if fields in registry_field_names else fields return {field: encode(field) for field in fields if field in registry_field_names} # https://lamin.ai/laminlabs/lamindata/transform/BblTiuKxsb2g0003 # https://claude.ai/chat/6ea2498c-944d-4e7a-af08-29e5ddf637d2 def reshape_annotate_result( registry: Registry, df: pd.DataFrame, field_names: list[str], cols_from_include: dict[str, str] | None, feature_qs: QuerySet | None, ) -> pd.DataFrame: """Reshapes tidy table to wide format. 
Args: registry: The registry model (e.g., Artifact) df: Input dataframe with experimental data field_names: List of basic fields to include in result cols_from_include: Dict specifying additional columns to process with types ('one' or 'many'), e.g., {'ulabels__name': 'many', 'created_by__name': 'one'} feature_qs: QuerySet of features """ import pandas as pd from lamindb.models import Artifact, Run cols_from_include = cols_from_include or {} # Initialize result with basic fields (need a copy since we're modifying it) result = df[field_names].copy() pk_name = registry._meta.pk.name # ========== no features requested ========== if feature_qs is None or not feature_qs.exists(): if cols_from_include: result = process_cols_from_include(df, result, cols_from_include, pk_name) return result.drop_duplicates(subset=[pk_name]) # ========== process features ========== # Encode Django field names to avoid conflicts with feature names fields_map = encode_lamindb_fields_as_columns(registry, df.columns) df_encoded = df.rename(columns=fields_map) result_encoded = result.rename(columns=fields_map) pk_name_encoded = fields_map.get(pk_name) # type: ignore # --- Process JSON-stored feature values --- json_values_attribute = ( "json_values" if registry in {Artifact, Run} else "values_json" ) feature_name_col = f"{json_values_attribute}__feature__name" feature_value_col = f"{json_values_attribute}__value" if all(col in df_encoded.columns for col in [feature_name_col, feature_value_col]): # Separate dict and non-dict values for different aggregation strategies is_dict_or_list = df_encoded[feature_value_col].apply( lambda x: isinstance(x, (dict, list)) ) dict_or_list_df = df_encoded[is_dict_or_list] non_dict_or_list_df = df_encoded[~is_dict_or_list] # Aggregate: sets for non-dict values, first for dict values groupby_cols = [pk_name_encoded, feature_name_col] non_dict_or_list_features = non_dict_or_list_df.groupby(groupby_cols)[ feature_value_col ].agg(set) dict_or_list_features = dict_or_list_df.groupby(groupby_cols)[ feature_value_col ].agg("first") # Combine and pivot to wide format combined_features = pd.concat( [non_dict_or_list_features, dict_or_list_features] ) feature_values = combined_features.unstack().reset_index() if not feature_values.empty: result_encoded = result_encoded.join( feature_values.set_index(pk_name_encoded), on=pk_name_encoded, ) # --- Process categorical/linked features --- links_prefix = "links_" if registry in {Artifact, Run} else ("links_", "values_") links_features = [ col for col in df.columns if "feature__name" in col and col.startswith(links_prefix) ] if links_features: result_encoded = process_links_features( df_encoded, result_encoded, links_features, feature_qs, pk_name_encoded, ) # --- Apply type conversions based on feature metadata --- def extract_and_check_scalar(series: pd.Series) -> tuple[pd.Series, bool]: """Extract single elements and return if column is now scalar.""" has_multiple_values = False def extract_and_track(value): nonlocal has_multiple_values if not hasattr(value, "__len__") or isinstance(value, str): return value if len(value) != 1: has_multiple_values = True return value return next(iter(value)) extracted = series.apply(extract_and_track) is_scalar = not has_multiple_values return extracted, is_scalar for feature in feature_qs: if feature.name not in result_encoded.columns: continue result_encoded[feature.name], is_scalar = extract_and_check_scalar( result_encoded[feature.name] ) if is_scalar: dtype_str = feature._dtype_str if dtype_str.startswith("cat"): 
result_encoded[feature.name] = result_encoded[feature.name].astype( "category" ) if dtype_str == "datetime": # format and utc args are needed for mixed data # pandera expects timezone-naive datetime objects, and hence, # we need to localize with None result_encoded[feature.name] = pd.to_datetime( result_encoded[feature.name], format="ISO8601", utc=True ).dt.tz_localize(None) if dtype_str == "date": # see comments for datetime result_encoded[feature.name] = ( pd.to_datetime( result_encoded[feature.name], format="ISO8601", utc=True, ) .dt.tz_localize(None) .dt.date ) if dtype_str == "bool": result_encoded[feature.name] = result_encoded[feature.name].astype( "boolean" ) dtype_str = feature._dtype_str if dtype_str.startswith("list"): mask = result_encoded[feature.name].notna() result_encoded.loc[mask, feature.name] = result_encoded.loc[ mask, feature.name ].apply(lambda x: list(x) if isinstance(x, (set, list)) else [x]) if dtype_str == "dict": # this is the case when a dict is stored as a string; won't happen # within lamindb but might for external data if isinstance(result_encoded[feature.name].iloc[0], str): result_encoded[feature.name] = result_encoded[feature.name].apply( lambda x: ast.literal_eval(x) if isinstance(x, str) else x ) # --- Finalize result --- # Reorder columns to prioritize features result_encoded = reorder_subset_columns_in_df( result_encoded, feature_qs.to_list("name"), # type: ignore ) # Process additional included columns if cols_from_include: cols_from_include_encoded = { fields_map.get(k, k): v # type: ignore for k, v in cols_from_include.items() } result_encoded = process_cols_from_include( df_encoded, result_encoded, cols_from_include_encoded, pk_name_encoded ) # Decode field names back to original, except where conflicts exist # (e.g., if a feature is also named 'id', keep the encoded field name) decode_map = { encoded: original for original, encoded in fields_map.items() # type: ignore if original not in result_encoded.columns } return result_encoded.drop_duplicates(subset=[pk_name_encoded]).rename( columns=decode_map ) def process_links_features( df: pd.DataFrame, result: pd.DataFrame, feature_cols: list[str], feature_qs: QuerySet | None, pk_name: str = "id", ) -> pd.DataFrame: """Process links_XXX feature columns.""" import pandas as pd from lamindb.models.feature import parse_dtype # this loops over different entities that might be linked under a feature for feature_col in feature_cols: links_attribute = "links_" if feature_col.startswith("links_") else "values_" regex = f"{links_attribute}(.+?)__feature__name" prefix = re.match(regex, feature_col).group(1) value_cols = [ col for col in df.columns if col.startswith(f"{links_attribute}{prefix}__") and "feature__name" not in col ] if not value_cols: continue value_col = value_cols[0] feature_names = df[feature_col].unique() feature_names = feature_names[~pd.isna(feature_names)] for feature in feature_qs: if feature.name not in feature_names: continue if feature.name in result.columns: continue field_name = parse_dtype(feature._dtype_str)[0]["field_str"] value_col = [c for c in value_cols if c.endswith(f"__{field_name}")][0] mask = (df[feature_col] == feature.name) & df[value_col].notna() feature_values = df[mask].groupby(pk_name)[value_col].agg(set) result.insert(3, feature.name, result[pk_name].map(feature_values)) return result def process_cols_from_include( df: pd.DataFrame, result: pd.DataFrame, extra_columns: dict[str, str], pk_name: str = "id", ) -> pd.DataFrame: """Process additional columns based on their 
specified types.""" for col, col_type in extra_columns.items(): if col not in df.columns: continue if col in result.columns: continue values = df.groupby(pk_name)[col].agg(set if col_type == "many" else "first") result.insert(3, col, result[pk_name].map(values)) return result def _queryset_class_factory( registry: Registry, queryset_cls: type[models.QuerySet] ) -> type[models.QuerySet]: from lamindb.models import Artifact, ArtifactSet # If the model is Artifact, create a new class for BasicQuerySet or QuerySet that inherits from ArtifactSet. # This allows to add artifact specific functionality to all classes inheriting from BasicQuerySet. # Thus all query sets of artifacts (and only of artifacts) will have functions from ArtifactSet. if registry is Artifact and not issubclass(queryset_cls, ArtifactSet): new_cls = type( "Artifact" + queryset_cls.__name__, (queryset_cls, ArtifactSet), {} ) else: new_cls = queryset_cls return new_cls class BasicQuerySet(models.QuerySet): """Sets of records returned by queries. See Also: `django QuerySet `__ Examples: Any filter statement produces a query set:: queryset = Registry.filter(name__startswith="keyword") """ def __new__(cls, model=None, query=None, using=None, hints=None): # see comments in _queryset_class_factory return object.__new__(_queryset_class_factory(model, cls)) def _to_class( self, cls: type[models.QuerySet], copy: bool = True ) -> models.QuerySet: qs = self.all() if copy else self qs.__class__ = cls return qs def _to_basic(self, copy: bool = True) -> BasicQuerySet: cls = _queryset_class_factory(self.model, BasicQuerySet) return self._to_class(cls, copy) def _to_non_basic(self, copy: bool = True) -> QuerySet: cls = _queryset_class_factory(self.model, QuerySet) return self._to_class(cls, copy) @doc_args(SQLRecord.to_dataframe.__doc__) def to_dataframe( self, *, include: str | list[str] | None = None, features: str | list[str] | None = None, limit: int | None = 100, order_by: str | None = "-id", ) -> pd.DataFrame: """{}""" # noqa: D415 import pandas as pd if ( self.model.__name__ == "Artifact" and "kind" not in str(self.query.where) and self.query.low_mark == 0 # this should be 0, not None, it represent OFFSET = 0 and self.query.high_mark is None # this should be None, it represent _no_ LIMIT ): subset = self.exclude(**{"kind__startswith": "__lamindb"}) else: subset = self # check if queryset is already ordered is_ordered = bool(subset.query.order_by) # Only apply order_by if not already ordered and order_by is specified if not is_ordered and order_by is not None: subset = subset.order_by(order_by) is_truncated = False if limit is not None: # Fetch one extra row as a sentinel to detect truncation without count(). 
subset = subset[: limit + 1] if include is None: include_input = [] elif isinstance(include, str): include_input = [include] else: include_input = include if "features" in include_input: include_input.remove("features") if features is None: # indicate the default features with True # should refactor this in the future features = True # type: ignore features_input = [] if features is None else features include = get_backward_compat_filter_kwargs(subset, include_input) field_names = get_basic_field_names(subset, include_input, features_input) annotate_kwargs = {} filtered_relations = {} # type: ignore feature_qs = None if features: feature_annotate_kwargs, feature_qs, filtered_relations = ( get_feature_annotate_kwargs(subset.model, features, subset) ) annotate_kwargs.update(feature_annotate_kwargs) if include_input: include_input = include_input.copy()[::-1] # type: ignore include_kwargs = {s: F(s) for s in include_input if s not in field_names} annotate_kwargs.update(include_kwargs) if annotate_kwargs: id_subquery = subset.values("id") # for annotate, we want the queryset without filters so that joins don't affect the annotations query_set_without_filters = subset.model.objects.using(subset.db).filter( id__in=Subquery(id_subquery) ) if subset.query.order_by: # Apply the same ordering to the new queryset query_set_without_filters = query_set_without_filters.order_by( *subset.query.order_by ) if filtered_relations: query_set_without_filters = query_set_without_filters.annotate( **filtered_relations ) queryset = query_set_without_filters.annotate(**annotate_kwargs) else: queryset = subset # our main problem with this approach is that we lose ordering in categorical lists # we'd need to respect ordering through the primary key on the link table, but that's # another refactoring effort # we have the correct ordering in `features.get_values()`, though df = pd.DataFrame(queryset.values(*field_names, *list(annotate_kwargs.keys()))) if limit is not None and len(df) > limit: is_truncated = True df = df.iloc[:limit].copy() if len(df) == 0: df = pd.DataFrame({}, columns=field_names) return df cols_from_include = analyze_lookup_cardinality(self.model, include_input) # type: ignore df_reshaped = reshape_annotate_result( self.model, df, field_names, cols_from_include, feature_qs ) pk_name = self.model._meta.pk.name encoded_pk_name = encode_lamindb_fields_as_columns(self.model, pk_name) if encoded_pk_name in df_reshaped.columns: df_reshaped = df_reshaped.set_index(encoded_pk_name) else: pk_column_name = pk_name if pk_name in df.columns else f"{pk_name}_id" if pk_column_name in df_reshaped.columns: df_reshaped = df_reshaped.set_index(pk_column_name) # cast floats and ints where appropriate # this is currently needed because the UI writes into the JSON field through JS # and thus a `10` might be a float, not an int # note: also type casting within reshape_annotate_result if feature_qs is not None: for feature in feature_qs: if feature.name in df_reshaped.columns: current_dtype = df_reshaped[feature.name].dtype dtype_str = feature._dtype_str if dtype_str == "int" and not pd.api.types.is_integer_dtype( current_dtype ): df_reshaped[feature.name] = df_reshaped[feature.name].astype( "Int64" # nullable integer dtype ) elif dtype_str == "float" and not pd.api.types.is_float_dtype( current_dtype ): df_reshaped[feature.name] = df_reshaped[feature.name].astype( float ) if is_truncated: logger.warning( f"truncated query result to limit={limit} {self.model.__name__} objects" ) return df_reshaped 
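    # Illustrative usage sketch for `to_dataframe` (editor-added comment, not part of the
    # library code): it assumes an Artifact queryset and a feature named "experiment"
    # that exists in your instance. `include` annotates related fields, `features` pulls
    # feature values into columns, and rows beyond `limit` are truncated with a warning.
    #
    #     df = ln.Artifact.filter(suffix=".h5ad").to_dataframe(
    #         include=["created_by__handle"],
    #         features=["experiment"],
    #         limit=50,
    #         order_by="-created_at",
    #     )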
@deprecated(new_name="to_dataframe") def df( self, include: str | list[str] | None = None, features: bool | list[str] | str | None = None, ) -> pd.DataFrame: return self.to_dataframe(include=include, features=features) def describe(self, return_str: bool = False) -> str | None: """Describe the query set to learn about available fields.""" return self.model.describe(return_str=return_str) def delete(self, *args, permanent: bool | None = None, **kwargs): """Delete all records in the query set. Args: permanent: Whether to permanently delete the record (skips trash). Is only relevant for records that have the `branch` field. If `None`, uses soft delete for records that have the `branch` field, hard delete otherwise. Note: Calling `delete()` twice on the same queryset does NOT permanently delete in bulk operations. Use `permanent=True` for actual deletion. Examples: For a `QuerySet` object `qs`, call:: qs.delete() """ from lamindb.models import Artifact, Collection, Run, Storage, Transform if self.model is Run: if permanent is True: from .run import _permanent_delete_runs _permanent_delete_runs(self) return if permanent is not True: self.update(branch_id=-1) return if self.model is Transform: if permanent is True: from .transform import _permanent_delete_transforms _permanent_delete_transforms(self) return if permanent is not True: _adjust_is_latest_when_deleting_is_versioned(self) self.update(branch_id=-1, is_latest=False) return # Artifact, Collection: non-trivial delete behavior, handle in a loop if self.model in {Artifact, Collection}: for record in self: record.delete(*args, permanent=permanent, **kwargs) elif self.model is Storage: # storage does not have soft delete if permanent is False: raise ValueError( "Soft delete is not possible for Storage, " "use 'permanent=True' or 'permanent=None' for permanent deletion." ) for record in self: record.delete() else: if not permanent and hasattr(self.model, "branch_id"): logger.warning("moved records to trash (branch_id = -1)") self.update(branch_id=-1) else: if permanent is False: raise ValueError( f"Soft delete is not possible for {self.model.__name__}, " "use 'permanent=True' for permanent deletion." ) super().delete(*args, **kwargs) def to_list(self, field: str | None = None) -> list[SQLRecord] | list[str]: """Populate an (unordered) list with the results. Note that the order in this list is only meaningful if you ordered the underlying query set with `.order_by()`. Examples:: queryset.to_list() # list of records queryset.to_list("name") # list of values """ if field is None: return list(self) else: # list casting is necessary because values_list does not return a list return list(self.values_list(field, flat=True)) def first(self) -> SQLRecord | None: """If non-empty, the first result in the query set, otherwise ``None``. Examples:: queryset.first() """ if len(self) == 0: return None return self[0] def one(self) -> SQLRecord: """Exactly one result. Raises error if there are more or none.""" return one_helper(self) def one_or_none(self) -> SQLRecord | None: """At most one result. Returns it if there is one, otherwise returns ``None``. 
Examples:: ULabel.filter(name="benchmark").one_or_none() ULabel.filter(name="non existing label").one_or_none() """ return one_helper(self, raise_doesnotexist=False) @doc_args(_search.__doc__) def search(self, string: str, **kwargs): """{}""" # noqa: D415 return _search(cls=self, string=string, **kwargs) @doc_args(_lookup.__doc__) def lookup(self, field: StrField | None = None, **kwargs) -> NamedTuple: """{}""" # noqa: D415 return _lookup(cls=self, field=field, **kwargs) # ------------------------------------------------------------------------------------- # CanCurate # ------------------------------------------------------------------------------------- @doc_args(CanCurate.validate.__doc__) def validate(self, values: ListLike, field: str | StrField | None = None, **kwargs): """{}""" # noqa: D415 return _validate(cls=self, values=values, field=field, **kwargs) @doc_args(CanCurate.inspect.__doc__) def inspect(self, values: ListLike, field: str | StrField | None = None, **kwargs): """{}""" # noqa: D415 return _inspect(cls=self, values=values, field=field, **kwargs) @doc_args(CanCurate.standardize.__doc__) def standardize( self, values: Iterable, field: str | StrField | None = None, **kwargs ): """{}""" # noqa: D415 return _standardize(cls=self, values=values, field=field, **kwargs) # this differs from BasicQuerySet only in .filter and .get # QueryManager returns BasicQuerySet because it is problematic to redefine .filter and .get # for a query set used by the default manager class QuerySet(BasicQuerySet): """Sets of records returned by queries. Implements additional filtering capabilities. See Also: `django QuerySet `__ Examples: >>> ULabel(name="my label").save() >>> queryset = ULabel.filter(name="my label") >>> queryset # an instance of QuerySet """ def _handle_unknown_field(self, error: FieldError) -> None: """Suggest available fields if an unknown field was passed.""" if "Cannot resolve keyword" in str(error): field = str(error).split("'")[1] avail_fields = self.model.__get_available_fields__() fields = ", ".join(sorted(avail_fields)) raise FieldError( f"Unknown field '{field}'. Available fields: {fields}" ) from None raise error # pragma: no cover def get(self, idlike: int | str | None = None, **expressions) -> SQLRecord: """Query a single record. Raises error if there are more or none.""" is_run_input = expressions.pop("is_run_input", False) # artifacts_from_path and get accept only BasicQuerySet qs = self._to_class(BasicQuerySet, copy=True) if path := expressions.pop("path", None): from .artifact_set import ArtifactSet, artifacts_from_path if not isinstance(self, ArtifactSet): raise ValueError("Querying by path is only possible for artifacts.") qs = artifacts_from_path(qs, path) try: record = get(qs, idlike, **expressions) except ValueError as e: # Pass through original error for explicit id lookups if "Field 'id' expected a number" in str(e): if "id" in expressions: raise field = next(iter(expressions)) raise FieldError( f"Invalid lookup '{expressions[field]}' for {field}. Did you mean {field}__name?" 
) from None raise # pragma: no cover except FieldError as e: self._handle_unknown_field(e) raise # pragma: no cover if is_run_input is not False: # might be None or True or Run from .artifact import Artifact, track_run_input from .collection import Collection if isinstance(record, (Artifact, Collection)): track_run_input(record, is_run_input) return record def filter(self, *queries, **expressions) -> QuerySet: """Query a set of records.""" from lamindb.models import Artifact, Record, Run from .feature import FeaturePredicate feature_predicates = [q for q in queries if isinstance(q, FeaturePredicate)] queries = tuple(q for q in queries if not isinstance(q, FeaturePredicate)) registry = self.model is_status_filter_on_run = registry is Run and any( key.split("__")[0] == "status" for key in expressions ) can_filter_with_features = registry in { Artifact, Run, Record, } if ( not expressions.pop("_skip_filter_with_features", False) and can_filter_with_features and not is_status_filter_on_run ): from ._feature_manager import filter_with_features qs = filter_with_features(self, *queries, **expressions) else: # Suggest to use __name for related fields such as id when not passed for field, value in expressions.items(): if ( isinstance(value, str) and value.strip("-").isalpha() and "__" not in field and hasattr(registry, field) ): field_attr = getattr(registry, field) if hasattr(field_attr, "field") and field_attr.field.related_model: raise FieldError( f"Invalid lookup '{value}' for {field}. Did you mean {field}__name?" ) expressions = process_expressions(self, queries, expressions) # need to run a query if queries or expressions are not empty if queries or expressions: try: qs = super().filter(*queries, **expressions) except FieldError as e: self._handle_unknown_field(e) else: qs = self if feature_predicates: if not can_filter_with_features: raise FieldError( f"Feature predicates are only supported for Artifact, Run, and Record, not {registry.__name__}." ) from ._feature_manager import filter_with_feature_predicates # Run predicate translation on a BasicQuerySet clone. # - `copy=True` avoids mutating `qs.__class__` in place while we temporarily # switch query set type for this translation phase. # - We intentionally do not use `_skip_filter_with_features` here: that flag # guards the QuerySet.filter() feature dispatcher path, while this code # bypasses that dispatcher and executes predicate translation directly. qs = filter_with_feature_predicates( qs._to_class(BasicQuerySet, copy=True), feature_predicates )._to_class(type(qs), copy=False) return qs @final class NonInstantiableQuerySet: """Wrapper around QuerySet that prevents instantiation while preserving query methods.""" def __init__(self, qs: QuerySet, registry_name: str): self._qs = qs self._name = registry_name def __repr__(self) -> str: return f"" def __call__(self, *args, **kwargs): raise TypeError( f"Cannot instantiate {self._name} from DB. " f"Use {self._name}.filter(), {self._name}.get(), etc. to query records." ) def __getattr__(self, attr): return getattr(self._qs, attr) class ModuleNamespace: """Namespace for accessing registries from a specific schema module. Args: query_db: Parent DB instance. module_name: Name of the schema module (e.g., 'bionty', 'pertdb'). 
""" def __init__(self, query_db: DB, module_name: str): self._query_db = query_db self._module_name = module_name self._cache: dict[str, NonInstantiableQuerySet] = {} def __getattr__(self, name: str) -> NonInstantiableQuerySet: """Access a registry class from this schema module. Args: name: Registry class name (e.g., 'Gene', 'CellType'). Returns: QuerySet for the specified registry scoped to the parent instance. """ if name in self._cache: return self._cache[name] try: schema_module = import_module(self._module_name) if hasattr(schema_module, name): model_class = getattr(schema_module, name) queryset = model_class.connect(self._query_db._instance) wrapped = NonInstantiableQuerySet(queryset, name) self._cache[name] = wrapped return wrapped except (ImportError, AttributeError): pass raise AttributeError( f"Registry '{name}' not found in lamindb. Use .bt.{name} or .pertdb.{name} for schema-specific registries." ) def __dir__(self) -> list[str]: """Return list of available registries in this schema module.""" base_attrs = [attr for attr in object.__dir__(self) if not attr.startswith("_")] try: schema_module = import_module(self._module_name) if hasattr(schema_module, "__all__"): registries = set() for class_name in schema_module.__all__: model_class = getattr(schema_module, class_name, None) if model_class and hasattr(model_class, "connect"): registries.add(class_name) return sorted(set(base_attrs) | registries) except ImportError: pass return base_attrs class BiontyDB(ModuleNamespace): """Namespace for Bionty registries (Gene, CellType, Disease, etc.).""" Gene: QuerySet[Gene] # type: ignore[type-arg] Protein: QuerySet[Protein] # type: ignore[type-arg] CellType: QuerySet[CellType] # type: ignore[type-arg] Disease: QuerySet[Disease] # type: ignore[type-arg] Phenotype: QuerySet[Phenotype] # type: ignore[type-arg] Pathway: QuerySet[Pathway] # type: ignore[type-arg] Tissue: QuerySet[Tissue] # type: ignore[type-arg] CellLine: QuerySet[CellLine] # type: ignore[type-arg] CellMarker: QuerySet[CellMarker] # type: ignore[type-arg] Organism: QuerySet[Organism] # type: ignore[type-arg] ExperimentalFactor: QuerySet[ExperimentalFactor] # type: ignore[type-arg] DevelopmentalStage: QuerySet[DevelopmentalStage] # type: ignore[type-arg] Ethnicity: QuerySet[Ethnicity] # type: ignore[type-arg] class PertdbDB(ModuleNamespace): """Namespace for `PertDB` registries (Biologic, Compound, etc.).""" Biologic: QuerySet[Biologic] # type: ignore[type-arg] Compound: QuerySet[Compound] # type: ignore[type-arg] CompoundPerturbation: QuerySet[CompoundPerturbation] # type: ignore[type-arg] GeneticPerturbation: QuerySet[GeneticPerturbation] # type: ignore[type-arg] EnvironmentalPerturbation: QuerySet[EnvironmentalPerturbation] # type: ignore[type-arg] CombinationPerturbation: QuerySet[CombinationPerturbation] # type: ignore[type-arg] PerturbationTarget: QuerySet[PerturbationTarget] # type: ignore[type-arg] class DB: """Query any registry of any instance. Args: instance: Instance identifier in format "account/instance". 
Examples: Query objects from an instance:: db = ln.DB("laminlabs/cellxgene") Query artifacts and filter by `suffix`:: db.Artifact.filter(suffix=".h5ad").to_dataframe() Get a single artifact by uid:: artifact = db.Artifact.get("abcDEF123456") Query records and filter by name:: db.Record.filter(name__startswith="sample").to_dataframe() Get a cell type object:: t_cell = db.bionty.CellType.get(name="T cell") Create a lookup object to auto-complete all cell types in the database:: cell_types = db.bionty.CellType.lookup() Return a `DataFrame` with additional info:: db.Artifact.filter( suffix=".h5ad", description__contains="immune", size__gt=1e9, # size > 1GB cell_types__name__in=["B cell", "T cell"], ).order_by("created_at").to_dataframe( include=["cell_types__name", "created_by__handle"] # include additional info ).head() """ Artifact: QuerySet[Artifact] # type: ignore[type-arg] Collection: QuerySet[Collection] # type: ignore[type-arg] Transform: QuerySet[Transform] # type: ignore[type-arg] Run: QuerySet[Run] # type: ignore[type-arg] User: QuerySet[User] # type: ignore[type-arg] Storage: QuerySet[Storage] # type: ignore[type-arg] Feature: QuerySet[Feature] # type: ignore[type-arg] ULabel: QuerySet[ULabel] # type: ignore[type-arg] Record: QuerySet[Record] # type: ignore[type-arg] Schema: QuerySet[Schema] # type: ignore[type-arg] Project: QuerySet[Project] # type: ignore[type-arg] Reference: QuerySet[Reference] # type: ignore[type-arg] Branch: QuerySet[Branch] # type: ignore[type-arg] Space: QuerySet[Space] # type: ignore[type-arg] bionty: BiontyDB pertdb: PertdbDB def __init__(self, instance: str): self._instance = instance self._cache: dict[str, NonInstantiableQuerySet | BiontyDB | PertdbDB] = {} self._available_registries: set[str] | None = None owner, instance_name = ( ln_setup._connect_instance.get_owner_name_from_identifier(instance) ) instance_info = ln_setup._connect_instance._connect_instance( owner=owner, name=instance_name ) self._modules = ["lamindb"] + list(instance_info.modules) def __getattr__(self, name: str) -> NonInstantiableQuerySet | BiontyDB | PertdbDB: """Access a registry class or schema namespace for this database instance. Args: name: Registry class name (e.g., 'Artifact', 'Collection') or schema namespace ('bionty', 'pertdb'). Returns: QuerySet for the specified registry or schema namespace scoped to this instance. """ if name in self._cache: return self._cache[name] if name == "bionty": if "bionty" not in self._modules: raise AttributeError( f"Schema 'bionty' not available in instance '{self._instance}'." ) if "bionty" not in self._cache: namespace = BiontyDB(self, "bionty") self._cache["bionty"] = namespace return self._cache["bionty"] if name == "pertdb": if "pertdb" not in self._modules: raise AttributeError( f"Schema 'pertdb' not available in instance '{self._instance}'." ) if "pertdb" not in self._cache: namespace = PertdbDB(self, "pertdb") # type: ignore self._cache["pertdb"] = namespace return self._cache["pertdb"] try: lamindb_module = import_module("lamindb") if hasattr(lamindb_module, name): model_class = getattr(lamindb_module, name) queryset = model_class.connect(self._instance) wrapped = NonInstantiableQuerySet(queryset, name) self._cache[name] = wrapped return wrapped except (ImportError, AttributeError): pass raise AttributeError( f"Registry '{name}' not found in lamindb core registries. Use .bionty.{name} or .pertdb.{name} for schema-specific registries." 
) def __repr__(self) -> str: return f"DB('{self._instance}')" def __dir__(self) -> list[str]: """Return list of available registries and schema namespaces.""" base_attrs = [attr for attr in super().__dir__() if not attr.startswith("_")] lamindb_registries = set() try: lamindb_module = import_module("lamindb") if hasattr(lamindb_module, "__all__"): for class_name in lamindb_module.__all__: model_class = getattr(lamindb_module, class_name, None) if model_class and hasattr(model_class, "connect"): lamindb_registries.add(class_name) except ImportError: pass module_namespaces = set() if "bionty" in self._modules: module_namespaces.add("bionty") if "pertdb" in self._modules: module_namespaces.add("pertdb") return sorted(set(base_attrs) | lamindb_registries | module_namespaces) ================================================ FILE: lamindb/models/record.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Any, overload import pgtrigger from django.conf import settings as django_settings from django.db import models from django.db.models import CASCADE, PROTECT from lamin_utils import logger from lamindb.base.fields import ( CharField, DateTimeField, ForeignKey, JSONField, TextField, ) from lamindb.base.utils import class_and_instance_method, strict_classmethod from lamindb.errors import FieldValidationError from ..base.uids import base62_16 from .artifact import Artifact from .can_curate import CanCurate from .collection import Collection from .feature import Feature, convert_to_pandas_dtype from .has_parents import HasParents, _query_relatives from .query_set import ( QuerySet, encode_lamindb_fields_as_columns, get_default_branch_ids, reorder_subset_columns_in_df, ) from .run import Run, TracksRun, TracksUpdates, User, current_run, current_user_id from .sqlrecord import BaseSQLRecord, HasType, IsLink, SQLRecord, _get_record_kwargs from .transform import Transform from .ulabel import ULabel if TYPE_CHECKING: from datetime import datetime import pandas as pd from ._feature_manager import FeatureManager from .block import RecordBlock from .project import Project, RecordProject, RecordReference, Reference from .query_manager import RelatedManager from .query_set import SQLRecordList from .schema import Schema # keep docstring in sync with test_record_docstring_examples in test_record_basics.py IMPORTS_UID = "W3WdiFRZTvTJajNp" SCHEMA_IMPORTS_UID = "DGZkj4yhGWMJE5fu" class RecordBatch: """DataFrame-backed batch created by :meth:`Record.from_dataframe`.""" def __init__( self, *, cls: type[Record], df: pd.DataFrame, resolved_type: Record, name_field: str, ) -> None: self._cls = cls self._df = df self._resolved_type = resolved_type self._name_field = name_field self._records: list[Record] | None = None def __len__(self) -> int: return len(self._df) @property def type(self) -> Record: return self._resolved_type def _build_records(self) -> list[Record]: import pandas as pd records: list[Record] = [] row_dicts = self._df.to_dict(orient="records") for row in row_dicts: if self._name_field in row: name = row.pop(self._name_field) elif "name" in row: name = row.pop("name") else: name = None if pd.api.types.is_scalar(name) and pd.isna(name): name = None features: dict[str, Any] = {} for key, value in row.items(): if pd.api.types.is_scalar(value) and pd.isna(value): continue features[key] = value record_kwargs: dict[str, Any] = {"type": self._resolved_type} if features: record_kwargs["features"] = features records.append(self._cls(name=name, 
**record_kwargs)) return records def save(self) -> SQLRecordList[Record]: """Persist all records and their feature values.""" from .query_set import SQLRecordList from .save import save as ln_save if self._records is None: self._records = self._build_records() ln_save(self._records) return SQLRecordList(self._records) class Record(SQLRecord, HasType, HasParents, CanCurate, TracksRun, TracksUpdates): """Flexible records with sheets & markdown pages. Useful for managing samples, donors, cells, compounds, sequences, and other custom entities with their features. If you just want a simple label, use :class:`~lamindb.ULabel`. Args: name: `str | None = None` A name. description: `str | None = None` A description. type: `Record | None = None` The type of this record. is_type: `bool = False` Whether this record is a type (a record that classifies other records). features: `dict[str | Feature, Any] | None = None` Lazy feature values to persist on `.save()` or `ln.save([...])`. schema: `Schema | None = None` A schema defining allowed features for records of this type. Only applicable when `is_type=True`. reference: `str | None = None` For instance, an external ID or a URL. reference_type: `str | None = None` For instance, `"url"`. See Also: :class:`~lamindb.Feature` Dimensions of measurement (e.g. column of a sheet, attribute of a record). :class:`~lamindb.ULabel` Like `Record`, just without the ability to store features. Examples: Create a **record** with a single feature:: # create a feature if you don't yet have one gc_content = ln.Feature(name="gc_content", dtype=float).save() # create a record to track a sample sample1 = ln.Record(name="Sample 1", features={"gc_content": 0.5}).save() # describe the record sample1.describe() Group several records under a **record type**, optionally constrained with a :class:`~lamindb.Schema`:: # create a flexible record type to track experiments experiment_type = ln.Record(name="Experiment", is_type=True).save() experiment1 = ln.Record(name="Experiment 1", type=experiment_type).save() # create a feature to link experiments experiment = ln.Feature(name="experiment", dtype=experiment_type).save() # create a record type to track samples -- constrain it with a schema schema = ln.Schema([experiment, gc_content.with_config(optional=True)], name="sample_schema").save() sample_sheet = ln.Record(name="Sample Sheet", is_type=True, schema=schema).save() # group the sample1 record under the sample sheet sample1.type = sample_sheet sample1.save() # reset the feature values for the record including the experiment sample1.features.set_values({ "gc_content": 0.5, "experiment": "Experiment 1", # automatically resolves by name, also accepts the experiment1 object }) Export all records under a type to a dataframe:: experiment_type.to_dataframe() #> __lamindb_record_name__ ... #> Experiment 1 ... #> Experiment 2 ... 
Import records from a dataframe :meth:`~lamindb.Record.from_dataframe`:: records = ln.Record.from_dataframe(df, type="my_df").save() # creates a type my_df with inferred schema If you try to set incomplete features in a record in a sheet, you'll get a validation error:: sample2 = ln.Record(name="Sample 2", type=sample_sheet).save() sample2.features.set_values({"gc_content": 0.6}) # raises ValidationError because experiment is missing Query records by features:: ln.Record.filter(gc_content=0.55) # exact match ln.Record.filter(gc_content__gt=0.5) # greater than ln.Record.filter(type=sample_sheet) # just the record on the sheet If your feature names are ambiguous, you can use a `Feature` object to disambiguate:: # to set feature values sample1.features.set_values({gc_content: 0.5}) # gc_content is the feature object # to query by feature values ln.Record.filter(gc_content == 0.5) # instead of gc_content=0.5 You can edit records like spreadsheets on the hub: .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/XSzhWUb0EoHOejiw0001.png :width: 800px Just like for :class:`~lamindb.ULabel`, you can also model **ontologies** through the `parents`/`children` attributes. .. dropdown:: What is the difference between `Record` and `SQLRecord`? The features of a `Record` are flexible: you can dynamically define features and add features to a record. The fields of a `SQLRecord` are static: you need to define them in code and then migrate the underlying database. See :class:`~lamindb.models.SQLRecord` or the glossary for more information: :term:`docs:record`. """ class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" if ( django_settings.DATABASES.get("default", {}).get("ENGINE") == "django.db.backends.postgresql" ): triggers = [ pgtrigger.Trigger( name="prevent_record_type_cycle", operation=pgtrigger.Update | pgtrigger.Insert, when=pgtrigger.Before, condition=pgtrigger.Condition("NEW.type_id IS NOT NULL"), func=""" -- Check for direct self-reference IF NEW.type_id = NEW.id THEN RAISE EXCEPTION 'Cannot set type: record cannot be its own type'; END IF; -- Check for cycles in the type chain IF EXISTS ( WITH RECURSIVE type_chain AS ( SELECT type_id, 1 as depth FROM lamindb_record WHERE id = NEW.type_id UNION ALL SELECT r.type_id, tc.depth + 1 FROM lamindb_record r INNER JOIN type_chain tc ON r.id = tc.type_id WHERE tc.depth < 100 ) SELECT 1 FROM type_chain WHERE type_id = NEW.id ) THEN RAISE EXCEPTION 'Cannot set type: would create a cycle'; END IF; RETURN NEW; """, ), ] # also see raw SQL constraints for `is_type` and `type` FK validity in migrations _name_field: str = "name" id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, db_index=True, max_length=16, default=base62_16 ) """A universal random id, valid across DB instances.""" name: str = CharField(max_length=150, db_index=True, null=True) """Name or title of record (optional).""" type: Record | None = ForeignKey("self", PROTECT, null=True, related_name="records") """Type of record, e.g., `Sample`, `Donor`, `Cell`, `Compound`, `Sequence` ← :attr:`~lamindb.Record.records`. Allows to group records by type, e.g., all samples, all donors, all cells, all compounds, all sequences. 
""" records: RelatedManager[Record] """If a `type` (`is_type=True`), records of this `type`.""" description: str | None = TextField(null=True) """A description.""" reference: str | None = CharField(max_length=255, db_index=True, null=True) """A simple reference like a URL or external ID.""" reference_type: str | None = CharField(max_length=25, db_index=True, null=True) """Type of simple reference.""" extra_data: dict | None = models.JSONField(null=True) """Additional data in JSON format, not validated as features.""" schema: Schema | None = ForeignKey( "Schema", CASCADE, null=True, related_name="records" ) """A schema to enforce for a type ← :attr:`~lamindb.Schema.records`. This is analogous to the `schema` attribute of an `Artifact`. If `is_type` is `True`, the schema is used to enforce features for each record of this type. """ linked_records: RelatedManager[Record] = models.ManyToManyField( "Record", through="RecordRecord", symmetrical=False, related_name="linked_in_records", ) """Records linked in this record as a value ← :attr:`~lamindb.Record.linked_in_records`.""" linked_in_records: RelatedManager[Record] """Records linking this record as a value. Is reverse accessor for `linked_records`.""" parents: RelatedManager[Record] = models.ManyToManyField( "self", symmetrical=False, related_name="children" ) """Ontological parents of this record ← :attr:`~lamindb.Record.children`. You can build an ontology under a given `type`. For example, introduce a type `CellType` and model the hiearchy of cell types under it via `parents` and `children`. """ children: RelatedManager[Record] """Ontological children of this record. Is reverse accessor for `parents`.""" # this is handled manually here because we want to se the related_name attribute # (this doesn't happen via inheritance of TracksRun, everything else is the same) run: Run | None = ForeignKey( Run, PROTECT, related_name="output_records", null=True, default=current_run, editable=False, ) """Run that created the record ← :attr:`~lamindb.Run.output_records`.""" input_of_runs: RelatedManager[Run] = models.ManyToManyField( Run, related_name="input_records" ) """Runs that use this record as an input ← :attr:`~lamindb.Run.input_records`.""" artifacts: RelatedManager[Artifact] = models.ManyToManyField( Artifact, through="ArtifactRecord", related_name="records" ) """Artifacts annotated by this record ← :attr:`~lamindb.Artifact.records`.""" runs: RelatedManager[Run] = models.ManyToManyField( Run, through="RunRecord", related_name="records" ) """Runs annotated by this record ← :attr:`~lamindb.Run.records`.""" transforms: RelatedManager[Transform] = models.ManyToManyField( Transform, through="TransformRecord", related_name="records" ) """Transforms annotated by this record ← :attr:`~lamindb.Transform.records`.""" collections: RelatedManager[Collection] = models.ManyToManyField( Collection, through="CollectionRecord", related_name="records" ) """Collections annotated by this record ← :attr:`~lamindb.Collection.records`.""" projects: RelatedManager[Project] """Projects that annotate this record ← :attr:`~lamindb.Project.records`.""" references: RelatedManager[Reference] """References that annotate this record ← :attr:`~lamindb.Reference.records`.""" linked_transforms: RelatedManager[Transform] """Transforms linked in this record as values ← :attr:`~lamindb.Transform.linked_in_records`.""" linked_runs: RelatedManager[Run] """Runs linked in this record as values ← :attr:`~lamindb.Run.linked_in_records`.""" linked_ulabels: RelatedManager[ULabel] """ULabels 
linked in this record as values ← :attr:`~lamindb.ULabel.linked_in_records`.""" linked_artifacts: RelatedManager[Artifact] """Artifacts linked in this record as values ← :attr:`~lamindb.Artifact.linked_in_records`.""" linked_projects: RelatedManager[Project] """Projects linked in this record as values ← :attr:`~lamindb.Project.linked_in_records`.""" linked_references: RelatedManager[Reference] """References linked in this record as values ← :attr:`~lamindb.Reference.linked_in_records`.""" linked_collections: RelatedManager[Collection] """Collections linked in this record as values ← :attr:`~lamindb.Collection.linked_in_records`.""" linked_users: RelatedManager[User] """Users linked in this record as values ← :attr:`~lamindb.User.linked_in_records`.""" ablocks: RelatedManager[RecordBlock] """Attached blocks ← :attr:`~lamindb.RecordBlock.record`.""" values_json: RelatedManager[RecordJson] """JSON values `(record_id, feature_id, value)`.""" values_record: RelatedManager[RecordRecord] """Record values with their features `(record_id, feature_id, value_id)`.""" values_ulabel: RelatedManager[RecordULabel] """ULabel values with their features `(record_id, feature_id, value_id)`.""" values_user: RelatedManager[RecordUser] """User values with their features `(record_id, feature_id, value_id)`.""" values_transform: RelatedManager[RecordTransform] """Transform values with their features `(record_id, feature_id, value_id)`.""" values_run: RelatedManager[RecordRun] """Run values with their features `(record_id, feature_id, value_id)`.""" values_artifact: RelatedManager[RecordArtifact] """Artifact values with their features `(record_id, feature_id, value_id)`.""" values_collection: RelatedManager[RecordCollection] """Collection values with their features `(record_id, feature_id, value_id)`.""" values_reference: RelatedManager[RecordReference] """Reference values with their features `(record_id, feature_id, value_id)`.""" values_project: RelatedManager[RecordProject] """Project values with their features `(record_id, feature_id, value_id)`.""" @overload def __init__( self, name: str | None = None, type: Record | None = None, is_type: bool = False, features: dict[str | Feature, Any] | None = None, description: str | None = None, schema: Schema | None = None, reference: str | None = None, reference_type: str | None = None, ): ... @overload def __init__( self, *db_args, ): ... 
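    # The first overload above documents keyword-based construction (name, type, features, ...);
    # the `*db_args` overload covers re-instantiation from raw database column values, which the
    # implementation below detects via `len(args) == len(self._meta.concrete_fields)`.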
def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None if len(args) > 0: raise ValueError("Only one non-keyword arg allowed") name: str = kwargs.pop("name", None) type: str | None = kwargs.pop("type", None) is_type: bool = kwargs.pop("is_type", False) features: dict[str | Feature, Any] | None = kwargs.pop("features", None) description: str | None = kwargs.pop("description", None) schema: Schema | None = kwargs.pop("schema", None) reference: str | None = kwargs.pop("reference", None) reference_type: str | None = kwargs.pop("reference_type", None) branch = kwargs.pop("branch", None) branch_id = kwargs.pop("branch_id", 1) space = kwargs.pop("space", None) space_id = kwargs.pop("space_id", 1) _skip_validation = kwargs.pop("_skip_validation", False) _aux = kwargs.pop("_aux", None) if len(kwargs) > 0: valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Record)]) raise FieldValidationError( f"Only {valid_keywords} are valid keyword arguments" ) if schema and not is_type: logger.important("passing schema, treating as type") is_type = True if features is not None: self._features = features super().__init__( name=name, type=type, is_type=is_type, description=description, reference=reference, reference_type=reference_type, schema=schema, branch=branch, branch_id=branch_id, space=space, space_id=space_id, _skip_validation=_skip_validation, _aux=_aux, ) def save(self, *args, **kwargs) -> Record: super().save(*args, **kwargs) if hasattr(self, "_features"): pending_features = self._features self.features.add_values(pending_features) del self._features return self @strict_classmethod def from_dataframe( cls, df: pd.DataFrame, *, type: Record | str, name_field: str = "__lamindb_record_name__", ) -> RecordBatch: """Construct a dataframe-backed batch of records for bulk saving. Returns a :class:`RecordBatch`. Follow with `records.save()`. Args: df: A dataframe where rows represent records. type: Record type for all rows as either a `Record` object or a string. If passing a string, a new type with that name is created under `Imports` with an inferred schema from the dataframe. If that type name already exists, raise an error and pass an existing `Record` object for reuse. If the resolved type is a sheet (`type.schema is not None`), feature values are validated against that schema at save time. name_field: Column used for record names. Falls back to `name` if absent. If neither exists, records are created without names. Examples: Create a new type and import records:: records = ln.Record.from_dataframe(df, type="my_df").save() Import records into an existing type:: records = ln.Record.from_dataframe(df, type=sample_sheet).save() """ import pandas as pd from .schema import Schema if not isinstance(df, pd.DataFrame): raise TypeError("`df` needs to be a pandas DataFrame.") resolved_type: Record if isinstance(type, str): imports_type = cls.filter(uid=IMPORTS_UID).one_or_none() if imports_type is None: imports_type = cls(name="Imports", is_type=True) imports_type.uid = IMPORTS_UID imports_type = imports_type.save() existing_type = cls.filter( name=type, is_type=True, type=imports_type ).one_or_none() if existing_type is not None: raise ValueError( f"type '{type}' already exists under 'Imports', please pass it as a Record object to reuse." 
) imports_schema = Schema.filter(uid=SCHEMA_IMPORTS_UID).one_or_none() if imports_schema is None: imports_schema = Schema(name="Imports", is_type=True) imports_schema.uid = SCHEMA_IMPORTS_UID imports_schema = imports_schema.save() inferred_schema = Schema.from_dataframe(df, name=type) if inferred_schema is None: raise ValueError( "Could not infer a schema from dataframe columns. " "Ensure dataframe columns map to existing Features, or pass an existing Record type object." ) inferred_schema.type = imports_schema inferred_schema = inferred_schema.save() resolved_type = cls( name=type, is_type=True, type=imports_type, schema=inferred_schema, ).save() else: resolved_type = type if not resolved_type.is_type: raise ValueError("`type` needs to be a record type (`is_type=True`).") if resolved_type.name is None: raise ValueError("`type` needs to have a non-null `name`.") return RecordBatch( cls=cls, df=df, resolved_type=resolved_type, name_field=name_field, ) @property def features(self) -> FeatureManager: """Manage the linked feature values. For examples, see :class:`~lamindb.Record` or :class:`~lamindb.models.FeatureManager`. """ from ._feature_manager import FeatureManager return FeatureManager(self) @property def is_sheet(self) -> bool: """Check if record is a `sheet`, i.e., `self.is_type and self.schema is not None`.""" return self.schema is not None and self.is_type def query_parents(self) -> QuerySet: """Query all parents of a record recursively. While `.parents` retrieves the direct parents, this method retrieves all ancestors of the current record. """ return _query_relatives([self], "parents") # type: ignore def query_children(self) -> QuerySet: """Query all children of a record recursively. While `.children` retrieves the direct children, this method retrieves all descendants of a parent. """ return _query_relatives([self], "children") # type: ignore def query_records(self) -> QuerySet: """Query records of sub types. While `.records` retrieves the records with the current type, this method also retrieves sub types and the records with sub types of the current type. """ return _query_relatives([self], "records") # type: ignore def _set_export_run(self, is_run_input: bool | Run | None = None) -> None: from lamindb.core._context import context from lamindb.models import Run, Transform if isinstance(is_run_input, Run): run = is_run_input elif is_run_input in {True, None}: if context.run is None: transform, _ = Transform.objects.get_or_create( key="__lamindb_record_export__", kind="function" ) run = Run(transform).save() else: run = context.run else: run = None self._export_run = run @class_and_instance_method def to_dataframe( cls_or_self, recurse: bool = False, is_run_input: bool | Run | None = None, **kwargs, ) -> pd.DataFrame: """Export to a pandas DataFrame. This is roughly equivalent to:: ln.Record.filter(type=sample_type).to_dataframe(include="features") `to_dataframe()` ensures that the columns are ordered according to the schema of the type and encodes fields like `uid` and `name`. It will also track the record as an input to the current run. Args: recurse: Whether to include records of sub-types recursively. is_run_input: Whether to track the record as a run input. **kwargs: Keyword arguments passed to :meth:`~lamindb.models.QuerySet.to_dataframe`. 
""" import pandas as pd if isinstance(cls_or_self, type): return type(cls_or_self).to_dataframe(cls_or_self, **kwargs) # type: ignore if not cls_or_self.is_type: raise TypeError( "to_dataframe() can only be called on the class or on record type instance." ) self = cls_or_self assert self.is_type, "Only types can be exported as dataframes" # noqa: S101 branch_ids = get_default_branch_ids() qs = ( self.query_records() if recurse else self.records.filter(branch_id__in=branch_ids) ) logger.important(f"exporting {qs.count()} records of '{self.name}'") if "order_by" not in kwargs: kwargs["order_by"] = "id" df = qs.to_dataframe(features="queryset", limit=None, **kwargs) encoded_id = encode_lamindb_fields_as_columns(self.__class__, "id") encoded_uid = encode_lamindb_fields_as_columns(self.__class__, "uid") encoded_name = encode_lamindb_fields_as_columns(self.__class__, "name") # encode the django id, uid and name fields if df.index.name == "id": df.index.name = encoded_id if "uid" in df.columns and encoded_uid not in df.columns: df = df.rename(columns={"uid": encoded_uid}) if "name" in df.columns and encoded_name not in df.columns: df = df.rename(columns={"name": encoded_name}) if self.schema is not None: all_features = self.schema.members.all() desired_order = all_features.to_list("name") # only members is ordered! for feature in all_features: if feature.name not in df.columns: df[feature.name] = pd.Series( dtype=convert_to_pandas_dtype(feature._dtype_str) ) else: # sort alphabetically for now desired_order = df.columns[2:].tolist() desired_order.sort() df = reorder_subset_columns_in_df(df, desired_order, position=0) # type: ignore self._set_export_run(is_run_input=is_run_input) self._export_run.input_records.add(self) return df.sort_index() # order by id def to_artifact( self, key: str | None = None, suffix: str | None = None, is_run_input: bool | Run | None = None, **kwargs, ) -> Artifact: """Calls `to_dataframe()` to create an artifact. The format defaults to `.csv` unless the key specifies another format or suffix is passed. The `key` defaults to `sheet_exports/{self.name}{suffix}` unless a `key` is passed. Args: key: `str | None = None` The artifact key. suffix: `str | None = None` The suffix to append to the default key if no key is passed. is_run_input: Whether to track the record as a run input. **kwargs: Keyword arguments passed to :meth:`~lamindb.models.Record.to_dataframe`. """ assert self.is_type, "Only types can be exported as artifacts." assert key is None or suffix is None, "Only one of key or suffix can be passed." 
if key is None: suffix = ".csv" if suffix is None else suffix key = f"sheet_exports/{self.name}{suffix}" description = f": {self.description}" if self.description is not None else "" return Artifact.from_dataframe( self.to_dataframe(is_run_input=is_run_input, **kwargs), key=key, description=f"Export of sheet {self.uid}{description}", schema=self.schema, csv_kwargs={"index": False}, run=self._export_run, ).save() # for storing JSON values in records class RecordJson(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_json") feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_recordjson") value: Any = JSONField(default=None, db_default=None) class Meta: app_label = "lamindb" # a list is modeled as a list in json, hence no multi-value association for the same feature unlike for # categorical/relational values unique_together = ("record", "feature") # for storing record-like values in records class RecordRecord(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_record") feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_recordrecord") value: Record = ForeignKey(Record, PROTECT, related_name="links_record") class Meta: app_label = "lamindb" unique_together = ("record", "feature", "value") # for storing ulabel-like values in records class RecordULabel(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_ulabel") feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_recordulabel") value: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_record") class Meta: # allows linking exactly one record to one ulabel per feature, because we likely don't want to have Many app_label = "lamindb" unique_together = ("record", "feature", "value") # for storing user-like values in records class RecordUser(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_user") feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_recorduser") value: User = ForeignKey(User, PROTECT, related_name="links_record") class Meta: # allows linking exactly one record to one user per feature, because we likely don't want to have Many app_label = "lamindb" unique_together = ("record", "feature", "value") # for storing run-like values in records class RecordRun(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_run") feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_recordrun") value: Run = ForeignKey(Run, PROTECT, related_name="links_in_record") class Meta: # allows linking several records to a single run for the same feature because we'll likely need this app_label = "lamindb" unique_together = ("record", "feature", "value") # for annotating runs with records class RunRecord(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) run: Run = ForeignKey(Run, CASCADE, related_name="links_record") record: Record = ForeignKey(Record, PROTECT, related_name="links_run") feature: Feature = ForeignKey( Feature, PROTECT, null=True, related_name="links_runrecord" ) created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) created_by: User = ForeignKey( "lamindb.User", PROTECT, 
default=current_user_id, related_name="+" ) class Meta: app_label = "lamindb" unique_together = ("run", "record", "feature") # for storing artifact-like values in records class RecordArtifact(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_artifact") feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_recordartifact") value: Artifact = ForeignKey(Artifact, PROTECT, related_name="links_in_record") class Meta: app_label = "lamindb" unique_together = ("record", "feature", "value") # for annotating artifacts with records class ArtifactRecord(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_record") record: Record = ForeignKey(Record, PROTECT, related_name="links_artifact") feature: Feature = ForeignKey( Feature, PROTECT, null=True, related_name="links_artifactrecord" ) class Meta: app_label = "lamindb" unique_together = ("artifact", "record", "feature") # for storing collection-like values in records class RecordCollection(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_collection") feature: Feature = ForeignKey( Feature, PROTECT, related_name="links_recordcollection" ) value: Collection = ForeignKey(Collection, PROTECT, related_name="links_in_record") class Meta: app_label = "lamindb" unique_together = ("record", "feature", "value") # for annotating collections with records class CollectionRecord(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) collection: Collection = ForeignKey( Collection, CASCADE, related_name="links_record" ) record: Record = ForeignKey(Record, PROTECT, related_name="links_collection") feature: Feature = ForeignKey( Feature, PROTECT, null=True, related_name="links_collectionrecord" ) class Meta: app_label = "lamindb" unique_together = ("collection", "record", "feature") # for storing transform-like values in records class RecordTransform(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_transform") feature: Feature = ForeignKey( Feature, PROTECT, related_name="links_recordtransform" ) value: Transform = ForeignKey(Transform, PROTECT, related_name="links_in_record") class Meta: app_label = "lamindb" unique_together = ("record", "feature", "value") # for annotating transforms with records class TransformRecord(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) transform: Transform = ForeignKey(Transform, CASCADE, related_name="links_record") record: Record = ForeignKey(Record, PROTECT, related_name="links_transform") feature: Feature = ForeignKey( Feature, PROTECT, null=True, related_name="links_transformrecord" ) created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now() ) created_by: User = ForeignKey( "lamindb.User", PROTECT, default=current_user_id, related_name="+" ) class Meta: app_label = "lamindb" unique_together = ("transform", "record", "feature") ================================================ FILE: lamindb/models/run.py ================================================ from __future__ import annotations import os import subprocess import sys from typing import TYPE_CHECKING, overload from django.db import models from django.db.models import ( CASCADE, PROTECT, Q, ) from lamin_utils 
import logger from lamindb_setup import _check_instance_setup from lamindb_setup import settings as setup_settings from lamindb.base.fields import ( BooleanField, CharField, DateTimeField, ForeignKey, TextField, ) from lamindb.base.users import current_user_id from lamindb.base.utils import strict_classmethod from ..base.types import RUN_CODE_TO_STATUS from ..base.uids import base62_16 from .can_curate import CanCurate from .query_set import BasicQuerySet, QuerySet from .sqlrecord import BaseSQLRecord, IsLink, SQLRecord if TYPE_CHECKING: from datetime import datetime from lamindb.base.types import RunStatus from ._feature_manager import FeatureManager from .artifact import Artifact from .block import RunBlock from .collection import Collection from .feature import Feature, JsonValue from .project import Project from .query_manager import RelatedManager from .record import Record from .transform import Transform from .ulabel import ULabel _TRACKING_READY: bool | None = None def current_run() -> Run | None: global _TRACKING_READY if not _TRACKING_READY: _TRACKING_READY = _check_instance_setup() if _TRACKING_READY: import lamindb # also see get_run() in core._data run = lamindb.core._functions.get_current_tracked_run() if run is None: run = lamindb.context.run return run else: return None class TracksRun(models.Model): """Base class tracking latest run, creating user, and `created_at` timestamp.""" class Meta: abstract = True created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" created_by: User = ForeignKey( "lamindb.User", PROTECT, editable=False, default=current_user_id, related_name="+", ) """Creator of record.""" run: Run | None = ForeignKey( "lamindb.Run", PROTECT, null=True, default=current_run, related_name="+" ) """Run that created record.""" class TracksUpdates(models.Model): """Base class tracking previous runs and `updated_at` timestamp.""" class Meta: abstract = True updated_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of last update to record.""" class User(BaseSQLRecord, CanCurate): """Users. Every :class:`~lamindb.models.SQLRecord` has a `created_by` field that links to the creating user. This registry is automatically populated with user identities from LaminHub in case the user authenticates. 
    Examples:

        Query a user by handle::

            user = ln.User.get(handle="testuser1")
    """

    class Meta:
        app_label = "lamindb"

    _name_field: str = "handle"

    id: int = models.AutoField(primary_key=True)
    """Internal id, valid only in one DB instance."""
    uid: str = CharField(editable=False, unique=True, db_index=True, max_length=8)
    """Universal id, valid across DB instances."""
    handle: str = CharField(max_length=30, unique=True, db_index=True)
    """User handle, valid across DB instances (required)."""
    name: str | None = CharField(max_length=150, db_index=True, null=True)
    """Full name (optional)."""  # has to match hub specification, where it's also optional
    linked_in_records: RelatedManager[Record] = models.ManyToManyField(
        "Record", through="RecordUser", related_name="linked_users"
    )
    """This user is linked in these records as a value."""
    artifacts: RelatedManager[Artifact] = models.ManyToManyField(
        "Artifact",
        through="ArtifactUser",
        through_fields=("user", "artifact"),
        related_name="users",
    )
    """Artifacts annotated with this user."""
    created_artifacts: RelatedManager[Artifact]
    """Artifacts created by user."""
    created_transforms: RelatedManager[Transform]
    """Transforms created by user."""
    created_runs: RelatedManager[Run]
    """Runs created by user."""
    projects: RelatedManager[Project]
    """Projects this user is linked to (e.g. as member) ← :attr:`~lamindb.ProjectUser.project`."""
    created_at: datetime = DateTimeField(
        editable=False, db_default=models.functions.Now(), db_index=True
    )
    """Time of creation of object."""
    updated_at: datetime = DateTimeField(
        editable=False, db_default=models.functions.Now(), db_index=True
    )
    """Time of last update to object."""

    @overload
    def __init__(
        self,
        uid: str,
        handle: str,
        name: str | None,
    ): ...

    @overload
    def __init__(
        self,
        *db_args,
    ): ...

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)


class Run(SQLRecord, TracksUpdates):
    """Runs of transforms such as the executions of a script.

    Args:
        transform: :class:`~lamindb.Transform` A data transformation object.
        name: `str | None = None` A name.
        params: `dict | None = None` A dictionary of parameters.
        reference: `str | None = None` For instance, an external ID or URL.
        reference_type: `str | None = None` For instance, `redun_id`, `nextflow_id` or `url`.
        initiated_by_run: `Run | None = None` The `run` that triggers this `run`.

    See Also:
        :func:`~lamindb.track`
            Globally track a script or notebook run.
        :func:`~lamindb.step`
            Track a function execution with this decorator.

    Examples:

        Create a run record::

            ln.Transform(key="Cell Ranger", version="7.2.0", kind="pipeline").save()
            transform = ln.Transform.get(key="Cell Ranger", version="7.2.0")
            run = ln.Run(transform)

        Track a global run of a notebook or script::

            ln.track()
            ln.context.run  # global run object

        You can pass parameters to `Run(transform, params=params)` or add them later::

            run.params = {
                "learning_rate": 0.01,
                "input_dir": "s3://my-bucket/mydataset",
                "downsample": True,
                "preprocess_params": {
                    "normalization_type": "cool",
                    "subset_highlyvariable": True,
                },
            }
            run.save()

        In contrast to `.params`, features are indexed in the `Feature` registry and can
        reference relational categorical values.
If you want to link feature values, use:: run.features.set_values({ "experiment": "My experiment 1", }) Guide: :ref:`track-run-parameters` """ class Meta: app_label = "lamindb" _name_field: str = "started_at" id: int = models.BigAutoField(primary_key=True) """Internal id, valid only in one DB instance.""" # default uid was changed from base62_20 to base62_16 in 1.6.0 uid: str = CharField( editable=False, unique=True, db_index=True, max_length=20, default=base62_16 ) """Universal id, valid across DB instances.""" name: str | None = CharField(max_length=150, null=True, db_index=True) """An optional name for this run.""" description: str | None = TextField(null=True) """An optional description for this run.""" transform: Transform = ForeignKey("Transform", CASCADE, related_name="runs") """The transform that is being run ← :attr:`~lamindb.Transform.runs`.""" entrypoint: str | None = CharField(max_length=255, null=True, db_index=True) """The entrypoint of the transform. This could be a function name or the entry point of a CLI or workflow manager. """ started_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """The time this run started.""" finished_at: datetime | None = DateTimeField(db_index=True, null=True, default=None) """The time this run finished or aborted.""" # we don't want to make below a OneToOne because there could be the same trivial report # generated for many different runs report: Artifact | None = ForeignKey( "Artifact", PROTECT, null=True, related_name="_report_of", default=None ) """The report of this run such as an `.html` or `.txt` file.""" environment: Artifact | None = ForeignKey( "Artifact", PROTECT, null=True, related_name="_environment_of", default=None ) """The computational environment for this run. For instance, `Dockerfile`, `docker image`, `requirements.txt`, `environment.yml`, etc. """ plan: Artifact | None = ForeignKey( "Artifact", PROTECT, null=True, related_name="_plan_for_runs", default=None ) """The (agent) plan for this run. Also see: :attr:`~lamindb.Run.initiated_by_run`. """ input_records: RelatedManager[Record] """The collections serving as input for this run ← :attr:`~lamindb.Record.input_of_runs`.""" output_records: RelatedManager[Record] """The collections created in this run ← :attr:`~lamindb.Record.run`.""" input_artifacts: RelatedManager[Artifact] """The artifacts serving as input for this run ← :attr:`~lamindb.Artifact.input_of_runs`. """ output_artifacts: RelatedManager[Artifact] """The artifacts created in this run ← :attr:`~lamindb.Artifact.run`. This does **not** include recreated artifacts, which are tracked via :attr:`~lamindb.Run.recreated_artifacts`. If you want to query created + recreated artifacts, use :meth:`~lamindb.Run.query_output_artifacts` instead. """ recreated_artifacts: RelatedManager[Artifact] """The output artifacts that were recreated by this run ← :attr:`~lamindb.Artifact.recreating_runs`. Artifacts are *recreated* if they trigger a hash lookup match for an existing artifact. """ input_collections: RelatedManager[Collection] """The collections serving as input for this run ← :attr:`~lamindb.Collection.input_of_runs`.""" output_collections: RelatedManager[Collection] """The collections created in this run ← :attr:`~lamindb.Collection.run`.""" recreated_collections: RelatedManager[Collection] """The output collections that were recreated by this run ← :attr:`~lamindb.Collection.recreating_runs`. 
Collections are *recreated* if they trigger a hash lookup match for an existing collection. """ params: dict = models.JSONField(null=True) """Parameters (plain JSON values).""" json_values: RelatedManager[JsonValue] = models.ManyToManyField( "JsonValue", through="RunJsonValue", related_name="runs" ) """Feature-indexed JSON values ← :attr:`~lamindb.JsonValue.runs`.""" reference: str | None = CharField(max_length=255, db_index=True, null=True) """A reference like a URL or an external ID such as from a workflow manager.""" reference_type: str | None = CharField(max_length=25, db_index=True, null=True) """The type of the `reference` such as a workflow manager execution ID.""" cli_args: str | None = CharField(max_length=1024, null=True, default=None) """CLI arguments if the run was invoked from the command line.""" created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """The time of creation of this run.""" created_by: User = ForeignKey( "User", CASCADE, default=current_user_id, related_name="created_runs" ) """The creator of this run ← :attr:`~lamindb.User.created_runs`.""" ulabels: RelatedManager[ULabel] = models.ManyToManyField( "ULabel", through="RunULabel", related_name="runs" ) """The ulabels annotating this run ← :attr:`~lamindb.ULabel.runs`.""" initiated_by_run: Run | None = ForeignKey( "Run", CASCADE, null=True, related_name="initiated_runs", default=None ) """The run that initiated this run ← :attr:`~lamindb.Run.initiated_runs`.""" initiated_runs: RelatedManager[Run] """The runs that were initiated by this run.""" projects: RelatedManager[Project] """The projects annotating this run ← :attr:`~lamindb.Project.runs`.""" ablocks: RelatedManager[RunBlock] """Attached blocks ← :attr:`~lamindb.RunBlock.run`.""" records: RelatedManager[Record] """The records annotating this run ← :attr:`~lamindb.Record.runs`.""" linked_in_records: RelatedManager[Record] = models.ManyToManyField( "Record", through="RecordRun", related_name="linked_runs" ) """This run is linked in these records as a value ← :attr:`~lamindb.Record.linked_runs`.""" artifacts: RelatedManager[Artifact] = models.ManyToManyField( "Artifact", through="ArtifactRun", related_name="runs" ) """The artifacts annotated by this run ← :attr:`~lamindb.Artifact.runs`.""" linked_artifacts: RelatedManager[Artifact] = models.ManyToManyField( "Artifact", through="RunArtifact", related_name="linked_by_runs", ) """The artifacts linked by this run through the run's features ← :attr:`~lamindb.RunArtifact.artifact`.""" _is_consecutive: bool | None = BooleanField(null=True) """Indicates whether code was consecutively executed. Is relevant for notebooks.""" _status_code: int = models.SmallIntegerField( default=-3, db_default=-3, db_index=True, ) """Status code of the run. See the status property for mapping to string.""" @overload def __init__( self, transform: Transform, name: str | None = None, description: str | None = None, entrypoint: str | None = None, params: dict | None = None, reference: str | None = None, reference_type: str | None = None, initiated_by_run: Run | None = None, plan: Artifact | None = None, ): ... @overload def __init__( self, *db_args, ): ... 
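# The constructor below serves two code paths: when Django instantiates the
# record from the database, it passes one positional value per concrete field
# and we defer to the base constructor; otherwise we run the user-facing path
# that validates `transform` and the other keyword arguments.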
def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None # now we proceed with the user-facing constructor if len(args) > 1: raise ValueError("Only one non-keyword arg allowed: transform") transform: Transform = None if "transform" in kwargs or len(args) == 1: transform = kwargs.pop("transform") if len(args) == 0 else args[0] name: str | None = kwargs.pop("name", None) description: str | None = kwargs.pop("description", None) entrypoint: str | None = kwargs.pop("entrypoint", None) params: dict | None = kwargs.pop("params", None) reference: str | None = kwargs.pop("reference", None) reference_type: str | None = kwargs.pop("reference_type", None) initiated_by_run: Run | None = kwargs.pop("initiated_by_run", None) report: Artifact | None = kwargs.pop("report", None) plan: Artifact | None = kwargs.pop("plan", None) if transform is None: raise TypeError("Pass transform parameter") if transform._state.adding: raise ValueError("Please save transform record before creating a run") if not len(kwargs) == 0: raise ValueError( f"Only transform, name, description, params, reference, reference_type, initiated_by_run, plan can be passed, but you passed: {kwargs}" ) super().__init__( # type: ignore transform=transform, name=name, description=description, entrypoint=entrypoint, params=params, reference=reference, reference_type=reference_type, initiated_by_run=initiated_by_run, report=report, plan=plan, ) @property def status(self) -> RunStatus: """Run status. Get the status of the run: =========== ===== =========================== status code description =========== ===== =========================== `scheduled` -3 The run is scheduled. `restarted` -2 The run was restarted. `started` -1 The run has started. `completed` 0 The run completed successfully. `errored` 1 The run ended with an error. `aborted` 2 The run was aborted. =========== ===== =========================== The database stores the run status as an integer code in field `_status_code`. Example: See the status of a run:: run.status #> 'completed' Query by status:: ln.Run.filter(status="completed").to_dataframe() """ return RUN_CODE_TO_STATUS[self._status_code] @property def features(self) -> FeatureManager: """Manage annotations with features. For examples, see :class:`~lamindb.Run` or :class:`~lamindb.models.FeatureManager`. """ from ._feature_manager import FeatureManager return FeatureManager(self) def query_output_artifacts( self, include_recreated: bool = True ) -> QuerySet[Artifact]: """Query output artifacts including recreated ones. This runs the following query under the hood:: ln.Artifact.filter(ln.Q(run=self) | ln.Q(recreating_runs=self)).distinct() Args: include_recreated: If `True`, return both originally created and recreated artifacts. If `False`, return only originally created artifacts. Returns: A queryset of :class:`~lamindb.Artifact` objects. See Also: :attr:`~lamindb.Run.output_artifacts` `QuerySet` of originally created artifacts. :attr:`~lamindb.Run.recreated_artifacts` `QuerySet` of recreated artifacts. """ if not include_recreated: return self.output_artifacts.all() else: return self.output_artifacts.model.filter( Q(run=self) | Q(recreating_runs=self) ).distinct() @strict_classmethod def filter( cls, *queries, **expressions, ) -> QuerySet: """Query a set of artifacts. Args: *queries: `Q` expressions. **expressions: Params, fields, and values passed via the Django query syntax. 
See Also: - Guide: :doc:`docs:registries` Examples: Query by fields:: ln.Run.filter(key="examples/my_file.parquet") Query by params:: ln.Run.filter(hyperparam_x=100) """ # from Registry metaclass return type(cls).filter(cls, *queries, **expressions) def _permanent_delete_runs(runs: Run | QuerySet) -> None: """Execute bulk DELETE on runs and spawn artifact cleanup. Used by QuerySet and single-run paths.""" if isinstance(runs, Run): db = runs._state.db or "default" first_run_uid = runs.uid artifact_ids = [] if runs.environment_id: artifact_ids.append(runs.environment_id) if runs.report_id: artifact_ids.append(runs.report_id) super(BaseSQLRecord, runs).delete() else: db = runs.db or "default" rows = list(runs.values_list("uid", "report_id", "environment_id")) if rows: first_run_uid = rows[0][0] else: return artifact_ids = list({aid for r in rows for aid in r[1:3] if aid is not None}) super(BasicQuerySet, runs).delete() if artifact_ids: ids_str = ",".join(map(str, artifact_ids)) instance = db if db not in (None, "default") else setup_settings.instance.slug # spawn background subprocess to delete orphaned report/env artifacts cmd: list[str] = [ sys.executable, "-m", "lamindb.models._run_cleanup", "--instance", instance, "--ids", ids_str, "--run-uid", first_run_uid, ] proc = subprocess.Popen( cmd, start_new_session=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=os.environ, ) log_path = setup_settings.cache_dir / f"run_cleanup_logs_{first_run_uid}.txt" logger.important( f"spawned run cleanup subprocess (pid={proc.pid}): {log_path}\n {' '.join(cmd)}" ) class RunJsonValue(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) run: Run = ForeignKey(Run, CASCADE, related_name="links_jsonvalue") # we follow the lower() case convention rather than snake case for link models jsonvalue: JsonValue = ForeignKey("JsonValue", PROTECT, related_name="links_run") created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" created_by: User = ForeignKey( "lamindb.User", PROTECT, default=current_user_id, related_name="+" ) """Creator of record.""" class Meta: app_label = "lamindb" unique_together = ("run", "jsonvalue") # for storing artifact-like values in runs # compare RunRecord as opposed to RecordRun class RunArtifact(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) run: Run = ForeignKey(Run, CASCADE, related_name="values_artifact") artifact: Artifact = ForeignKey("Artifact", PROTECT, related_name="links_in_run") feature: Feature | None = ForeignKey( "Feature", PROTECT, null=True, related_name="links_runartifact", default=None ) class Meta: app_label = "lamindb" unique_together = ("run", "artifact", "feature") ================================================ FILE: lamindb/models/save.py ================================================ # ruff: noqa: TC004 from __future__ import annotations import os import shutil import traceback from collections import defaultdict from datetime import datetime from typing import TYPE_CHECKING from django.db import IntegrityError, transaction from django.utils.functional import partition from lamin_utils import logger from lamindb_setup.core.upath import LocalPathClasses, UPath from ..core._settings import settings from .sqlrecord import ( UNIQUE_FIELD_NAMES, SQLRecord, parse_violated_field_from_error_message, ) if TYPE_CHECKING: from collections.abc import Iterable from .artifact import Artifact def save( records: Iterable[SQLRecord], 
ignore_conflicts: bool | None = False, batch_size: int = 10000, ) -> None: """Bulk save records. Note: This is much faster than saving records using ``record.save()``. Warning: Bulk saving neither automatically creates related records nor updates existing records! Use ``record.save()`` for these use cases. Args: records: Multiple :class:`~lamindb.models.SQLRecord` objects. ignore_conflicts: If `True`, do not error if some records violate a unique or another constraint. However, it won't update the id fields of records in place. If you need records with ids, you need to query them from the database. batch_size: Number of records to process in each batch. Large batch sizes can improve performance but may lead to memory issues. Examples: Save a list of records: >>> labels = [ln.ULabel(f"Label {i}") for i in range(10)] >>> ln.save(labels) For a single record, use ``record.save()``: >>> transform = ln.Transform(key="My pipeline") >>> transform.save() Update a single existing record: >>> transform = ln.Transform.get("0Cb86EZj") >>> transform.description = "New description" >>> transform.save() """ from .artifact import Artifact if isinstance(records, SQLRecord): raise ValueError("Please use record.save() if saving a single record.") # previously, this was all set based, # but models without primary keys aren't hashable # we distinguish between artifacts and non-artifacts # for artifacts, we want to bulk-upload rather than upload one-by-one non_artifacts, artifacts = partition(lambda r: isinstance(r, Artifact), records) if non_artifacts: non_artifacts_old, non_artifacts_new = partition( lambda r: r._state.adding or r.pk is None, non_artifacts ) bulk_create( non_artifacts_new, ignore_conflicts=ignore_conflicts, batch_size=batch_size ) if non_artifacts_old: bulk_update(non_artifacts_old, batch_size=batch_size) non_artifacts_with_parents = [ r for r in non_artifacts_new if hasattr(r, "_parents") ] if len(non_artifacts_with_parents) > 0: # this can only happen within bionty right now!! # we might extend to core lamindb later from bionty.core import add_ontology add_ontology(non_artifacts_with_parents) records_with_lazy_features = [ record for record in non_artifacts if record.__class__.__name__ == "Record" and hasattr(record, "_features") ] if records_with_lazy_features: from ._feature_manager import bulk_set_features_in_records bulk_set_features_in_records(records_with_lazy_features) if artifacts: with transaction.atomic(): for record in artifacts: # will switch to True after the successful upload / saving if getattr(record, "_local_filepath", None) is not None and getattr( record, "_to_store", False ): record._storage_ongoing = True record._save_skip_storage() using_key = settings._using_key store_artifacts(artifacts, using_key=using_key) # this function returns None as potentially 10k records might be saved # refreshing all of them from the DB would mean a severe performance penalty # 2nd reason: consistency with Django Model.save(), which also returns None return None def bulk_create( records: Iterable[SQLRecord], ignore_conflicts: bool | None = False, batch_size: int = 10000, ): """Create records in batches for safety and performance. Args: records: Iterable of SQLRecord objects to create ignore_conflicts: Whether to ignore conflicts during creation batch_size: Number of records to process in each batch.
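    Example:

        A minimal sketch; the `ULabel` records here are hypothetical, and any
        unsaved `SQLRecord` objects work (they are grouped by registry and
        created in batches)::

            import lamindb as ln
            from lamindb.models.save import bulk_create

            labels = [ln.ULabel(name=f"label{i}") for i in range(3)]
            bulk_create(labels, ignore_conflicts=True, batch_size=10000)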
""" records_by_orm = defaultdict(list) for record in records: records_by_orm[record.__class__].append(record) for registry, records_list in records_by_orm.items(): total_records = len(records_list) model_name = registry.__name__ if total_records > batch_size: logger.important( f"starting creation of {total_records} {model_name} records in batches of {batch_size}" ) # Process records in batches for i in range(0, len(records_list), batch_size): batch = records_list[i : i + batch_size] batch_num = (i // batch_size) + 1 total_batches = (total_records + batch_size - 1) // batch_size if total_records > batch_size: logger.info( f"processing batch {batch_num}/{total_batches} for {model_name}: {len(batch)} records" ) try: registry.objects.bulk_create(batch, ignore_conflicts=ignore_conflicts) # handle unique constraint violations due to non-default branches except IntegrityError as e: error_msg = str(e) if any(field in error_msg for field in UNIQUE_FIELD_NAMES) and ( "UNIQUE constraint failed" in error_msg or "duplicate key value violates unique constraint" in error_msg ): unique_fields = parse_violated_field_from_error_message(error_msg) # Build tuples of unique field values for each record unique_field_values = [ tuple(getattr(r, field) for field in unique_fields) for r in batch ] # Build Q objects for multi-field lookup from django.db.models import Q q_objects = Q() for values in unique_field_values: field_kwargs = { unique_fields[i]: values[i] for i in range(len(unique_fields)) } q_objects |= Q(**field_kwargs) # Query against non-default branches pre_existing_records_not_main_branch = registry.objects.filter( q_objects ).exclude(branch_id=1) # Get the unique field value tuples that already exist pre_existing_value_tuples = { tuple(getattr(rec, field) for field in unique_fields) for rec in pre_existing_records_not_main_branch } # Records that can be saved normally (not in non-default branches) records_main_branch = [ r for r in batch if tuple(getattr(r, field) for field in unique_fields) not in pre_existing_value_tuples ] save(records_main_branch) # Now move the pre-existing records to the main branch if pre_existing_value_tuples: unique_fields_str = ", ".join(unique_fields) logger.warning( f"some {model_name} records with the same ({unique_fields_str}) already exist in non-default branches - moving them to the default branch" ) pre_existing_records_to_move = [ r for r in batch if tuple(getattr(r, field) for field in unique_fields) in pre_existing_value_tuples ] for record in pre_existing_records_to_move: record.save() else: raise e def bulk_update( records: Iterable[SQLRecord], ignore_conflicts: bool | None = False, batch_size: int = 10000, ): """Update records in batches for safety and performance. Args: records: Iterable of SQLRecord objects to update ignore_conflicts: Whether to ignore conflicts during update (currently unused but kept for consistency) batch_size: Number of records to process in each batch. If None, processes all at once. 
""" records_by_orm = defaultdict(list) for record in records: records_by_orm[record.__class__].append(record) for registry, records_list in records_by_orm.items(): total_records = len(records_list) model_name = registry.__name__ if total_records > batch_size: logger.warning( f"starting update for {total_records} {model_name} records in batches of {batch_size}" ) field_names = [ field.name for field in registry._meta.fields if (field.name != "created_at" and field.name != "id") ] # Process records in batches for i in range(0, len(records_list), batch_size): batch = records_list[i : i + batch_size] batch_num = (i // batch_size) + 1 total_batches = (total_records + batch_size - 1) // batch_size if total_records > batch_size: logger.info( f"processing batch {batch_num}/{total_batches} for {model_name}: {len(batch)} records" ) registry.objects.bulk_update(batch, field_names) # This is also used within Artifact.save() def check_and_attempt_upload( artifact: Artifact, using_key: str | None = None, access_token: str | None = None, print_progress: bool = True, **kwargs, ) -> Exception | None: # kwargs are propagated to .upload_from in the end # if Artifact object is either newly instantiated or replace() was called on # a local env it will have a _local_filepath and needs to be uploaded if getattr(artifact, "_local_filepath", None) is not None: try: storage_path, cache_path = upload_artifact( artifact, using_key, access_token=access_token, print_progress=print_progress, **kwargs, ) except Exception as exception: logger.warning(f"could not upload artifact: {artifact}") # clear dangling storages if we were actually uploading or saving if getattr(artifact, "_to_store", False): # avoid root-level import of core.storage module from ..core.storage import paths artifact._clear_storagekey = paths.auto_storage_key_from_artifact( artifact ) # type: ignore return exception # copies (if on-disk) or moves the temporary file (if in-memory) to the cache if os.getenv("LAMINDB_MULTI_INSTANCE") is None: # this happens only after the actual upload was performed # we avoid failing here in case any problems happen in copy_or_move_to_cache # because the cache copying or cleanup is not absolutely necessary try: copy_or_move_to_cache(artifact, storage_path, cache_path) except Exception as e: if not str(e).startswith( "[WinError 32] The process cannot access the file " "because it is being used by another process" ): # ignore WinError 32 error, this just means that the file is still open on save # it is saved at this point, so not a big deal if copy or move to cache fails # this mostly happens for run logs # just ignore without a warning logger.warning(f"A problem with cache on saving: {e}") # after successful upload, we should remove the attribute so that another call # call to save won't upload again, the user should call replace() then del artifact._local_filepath # returning None means proceed (either success or no action needed) return None def copy_or_move_to_cache( artifact: Artifact, storage_path: UPath, cache_path: UPath | None ): local_path = artifact._local_filepath # in-memory cases if local_path is None or not local_path.exists(): return None local_path = local_path.resolve() is_dir = local_path.is_dir() cache_dir = settings.cache_dir # just delete from the cache dir if storage_path is local if cache_path is None: if ( local_path.as_posix() != storage_path.as_posix() and cache_dir in local_path.parents ): if is_dir: shutil.rmtree(local_path) else: local_path.unlink() return None # non-local storage_path 
further if local_path != cache_path: if cache_path.exists(): logger.important_hint( f"replacing the existing cache path {cache_path.as_posix()}" ) if cache_path.is_dir(): shutil.rmtree(cache_path) else: cache_path.unlink() else: cache_path.parent.mkdir(parents=True, exist_ok=True) if cache_dir in local_path.parents: local_path.replace(cache_path) else: if is_dir: shutil.copytree(local_path, cache_path) else: shutil.copy(local_path, cache_path) # make sure that the cached version is older than the cloud one mts = datetime.now().timestamp() + 1.0 if is_dir: files = (file for file in cache_path.rglob("*") if file.is_file()) for file in files: os.utime(file, times=(mts, mts)) else: os.utime(cache_path, times=(mts, mts)) # This is also used within Artifact.save() def check_and_attempt_clearing( artifact: Artifact, raise_file_not_found_error: bool = True, using_key: str | None = None, ) -> Exception | None: # this is a clean-up operation after replace() was called # or if there was an exception during upload if hasattr(artifact, "_clear_storagekey"): try: if artifact._clear_storagekey is not None: # type: ignore # avoid root-level import of core.storage module from ..core.storage import paths delete_msg = paths.delete_storage_using_key( artifact, artifact._clear_storagekey, # type: ignore raise_file_not_found_error=raise_file_not_found_error, using_key=using_key, ) if delete_msg != "did-not-delete": logger.success( f"deleted stale object at storage key {artifact._clear_storagekey}" # type: ignore ) artifact._clear_storagekey = None # type: ignore except Exception as exception: return exception # returning None means proceed (either success or no action needed) return None def store_artifacts( artifacts: Iterable[Artifact], using_key: str | None = None ) -> None: """Upload artifacts in a list of database-committed artifacts to storage. If any upload fails, subsequent artifacts are cleaned up from the DB. 
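    Note:

        This assumes each artifact's metadata row was already committed via
        `_save_skip_storage()`, as done in `save()` above; if an upload fails,
        the rows of artifacts that were not stored yet are removed again via
        `_delete_skip_storage()`.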
""" from .artifact import Artifact exception: Exception | None = None # because uploads might fail, we need to maintain a new list of the succeeded uploads stored_artifacts = [] # upload new local artifacts for artifact in artifacts: # failure here sets ._clear_storagekey # for cleanup below exception = check_and_attempt_upload(artifact, using_key) if exception is not None: break stored_artifacts += [artifact] # update to show successful saving # only update if _storage_ongoing was set to True before # this should be a single transaction for the updates of all the artifacts # but then it would just abort all artifacts, even those successfully stored before # TODO: there should also be some kind of exception handling here # but this requires refactoring if artifact._storage_ongoing: artifact._storage_ongoing = False # each .save() is a separate transaction below super(Artifact, artifact).save() # if check_and_attempt_upload was successful # then this can have only ._clear_storagekey from .replace exception = check_and_attempt_clearing( artifact, raise_file_not_found_error=True, using_key=using_key ) if exception is not None: logger.warning(f"clean up of {artifact._clear_storagekey} failed") # type: ignore break if exception is not None: # clean up metadata for artifacts not uploaded to storage with transaction.atomic(): for artifact in artifacts: if artifact not in stored_artifacts: artifact._delete_skip_storage() # clean up storage after failure in check_and_attempt_upload exception_clear = check_and_attempt_clearing( artifact, raise_file_not_found_error=False, using_key=using_key ) if exception_clear is not None: logger.warning( f"clean up of {artifact._clear_storagekey} after the upload error failed" # type: ignore ) error_message = prepare_error_message(artifacts, stored_artifacts, exception) # this is bad because we're losing the original traceback # needs to be refactored - also, the orginal error should be raised raise RuntimeError(error_message) return None def prepare_error_message(records, stored_artifacts, exception) -> str: if len(stored_artifacts) == 0: error_message = ( "No entries were uploaded or committed" " to the database. 
See error message:\n\n" ) else: error_message = ( "The following entries have been" " successfully uploaded and committed to the database:\n" ) for record in stored_artifacts: error_message += ( f"- {', '.join(record.__repr__().split(', ')[:3]) + ', ...)'}\n" ) error_message += "\nSee error message:\n\n" error_message += f"{str(exception)}\n\n{traceback.format_exc()}" return error_message def upload_artifact( artifact, using_key: str | None = None, access_token: str | None = None, print_progress: bool = True, **kwargs, ) -> tuple[UPath, UPath | None]: """Store and add file and its linked entries.""" # kwargs are propagated to .upload_from in the end # can't currently use filepath_from_artifact here because it resolves to ._local_filepath # avoid root-level import of core.storage module from ..core.storage import paths storage_key = paths.auto_storage_key_from_artifact(artifact) storage_path, storage_settings = paths.attempt_accessing_path( artifact, storage_key, using_key=using_key, access_token=access_token ) if getattr(artifact, "_to_store", False): logger.save(f"storing artifact '{artifact.uid}' at '{storage_path}'") paths.store_file_or_folder( artifact._local_filepath, storage_path, print_progress=print_progress, **kwargs, ) if isinstance(storage_path, LocalPathClasses): cache_path = None else: cache_key = paths._cache_key_from_artifact_storage(artifact, storage_settings) cache_path = storage_settings.cloud_to_local_no_update( storage_path, cache_key=cache_key ) return storage_path, cache_path ================================================ FILE: lamindb/models/schema.py ================================================ from __future__ import annotations import warnings from typing import TYPE_CHECKING, Any, Type, overload import numpy as np from django.db import models from django.db.models import CASCADE, PROTECT, ManyToManyField, Q from lamin_utils import logger from lamindb_setup.core import deprecated from lamindb_setup.core.hashing import HASH_LENGTH, hash_string from lamindb.base.fields import ( BooleanField, CharField, ForeignKey, IntegerField, TextField, ) from lamindb.base.types import FieldAttr, ListLike from lamindb.base.uids import base62_16 from lamindb.base.utils import class_and_instance_method from lamindb.errors import FieldValidationError, InvalidArgument from lamindb.models.feature import parse_cat_dtype from ..errors import ValidationError from ._describe import describe_schema, format_rich_tree from ._relations import ( dict_related_model_to_related_name, get_related_name, ) from .can_curate import CanCurate from .feature import ( Feature, serialize_dtype, serialize_pandas_dtype, ) from .has_parents import _query_relatives from .query_set import QuerySet, SQLRecordList from .run import TracksRun, TracksUpdates from .sqlrecord import ( BaseSQLRecord, HasType, IsLink, Registry, SQLRecord, _get_record_kwargs, init_self_from_db, update_attributes, ) if TYPE_CHECKING: import pandas as pd from django.db.models.query_utils import DeferredAttribute from .artifact import Artifact from .block import SchemaBlock from .project import Project from .query_manager import RelatedManager from .record import Record NUMBER_TYPE = "num" DICT_KEYS_TYPE = type({}.keys()) # type: ignore def validate_features(features: list[SQLRecord]) -> SQLRecord: """Validate and return feature type.""" try: if len(features) == 0: raise ValueError("Provide list of features with at least one element") except TypeError: raise ValueError( "Please pass a ListLike of features, not a single feature" ) from 
None if not hasattr(features, "__getitem__"): raise TypeError("features has to be list-like") if not isinstance(features[0], SQLRecord): raise TypeError( "features has to store feature records! use .from_values() otherwise" ) feature_types = {feature.__class__ for feature in features} if len(feature_types) > 1: raise TypeError("schema can only contain a single type") for feature in features: if feature._state.adding: raise ValueError("Can only construct feature sets from validated features") return next(iter(feature_types)) # return value in set of cardinality 1 def get_features_config( features: list[SQLRecord] | tuple[SQLRecord, dict], ) -> tuple[list[SQLRecord], list[tuple[SQLRecord, dict]]]: """Get features and their config from the return of feature.with_config().""" features_list = [] configs = [] try: for feature in features: if isinstance(feature, tuple): features_list.append(feature[0]) configs.append(feature) # store the tuple in configs else: features_list.append(feature) return features_list, configs # type: ignore except TypeError: return features, configs # type: ignore class SchemaOptionals: """Manage and access optional features in a schema.""" def __init__(self, schema) -> None: self.schema = schema def get_uids(self) -> list[str]: """Get the uids of the optional features. Does **not** need an additional query to the database, while `get()` does. """ if ( self.schema._aux is not None and "af" in self.schema._aux and "1" in self.schema._aux["af"] ): return self.schema._aux["af"]["1"] else: return [] def get(self) -> QuerySet: """Get the optional features.""" uids = self.get_uids() if uids: return Feature.objects.filter(uid__in=uids).order_by("links_schema__id") else: return Feature.objects.none() # empty QuerySet def set(self, features: list[Feature]) -> None: """Set the optional features (overwrites whichever schemas are currently optional).""" if not isinstance(features, list) or not all( isinstance(f, Feature) for f in features ): raise TypeError("features must be a list of Feature records!") self.schema._aux = self.schema._aux or {} if len(features) > 0: self.schema._aux.setdefault("af", {})["1"] = [f.uid for f in features] def remove(self, features: Feature | list[Feature]) -> None: """Make one or multiple features required by removing them from the set of optional features.""" if not isinstance(features, list): features = [features] if not all(isinstance(f, Feature) for f in features): raise TypeError("features must be a list of Feature records!") if len(features) > 0: self.schema._aux = self.schema._aux or {} if "1" in self.schema._aux.get("af", {}): for feature in features: self.schema._aux["af"]["1"].remove(feature.uid) def add(self, features: Feature | list[Feature]) -> None: """Make one or multiple features optional by adding them to the set of optional features.""" self.schema._aux = self.schema._aux or {} if not isinstance(features, list): features = [features] if not all(isinstance(f, Feature) for f in features): raise TypeError("features must be a list of Feature records!") if len(features) > 0: if "1" not in self.schema._aux.setdefault("af", {}): self.set(features) else: self.schema._aux.setdefault("af", {})["1"].extend( [f.uid for f in features] ) KNOWN_SCHEMAS = { # by hash "kMi7B_N88uu-YnbTLDU-DA": "0000000000000000", # valid_features "1gocc_TJ1RU2bMwDRK-WUA": "0000000000000001", # valid_ensembl_gene_ids "UR_ozz2VI2sY8ckXop2RAg": "0000000000000002", # anndata_ensembl_gene_ids_and_valid_features_in_obs (itype='Composite') "aqGWHvyY49W_PHELUMiBMw": 
"0000000000000002", # anndata_ensembl_gene_ids_and_valid_features_in_obs (itype=None) } class Schema(SQLRecord, HasType, CanCurate, TracksRun, TracksUpdates): """Schemas of datasets such as column sets of dataframes. .. note:: To create a schema, at least one of the following parameters must be passed: - `features` - a list of `Feature` objects - `itype` - the identifier type, e.g., `Feature` or `bt.Gene.ensembl_gene_id` - `slots` - a dictionary mapping slots to :class:`~lamindb.Schema` objects, e.g., for an `AnnData`, `{"obs": Schema(...), "var.T": Schema(...)}` - `is_type=True` - a *schema type* to group schemas, e.g., "ProteinPanel" Args: features: `list[SQLRecord] | list[tuple[Feature, dict]] | None = None` Feature records, e.g., `[Feature(...), Feature(...)]` or features with their config, e.g., `[Feature(...).with_config(optional=True)]`. slots: `dict[str, Schema] | None = None` A dictionary mapping slot names to :class:`~lamindb.Schema` objects to create a _composite_ schema. name: `str | None = None` Name of the schema. description: `str | None = None` Description of the schema. itype: `str | None = None` Feature identifier type to validate against, e.g., `ln.Feature` or `bt.Gene.ensembl_gene_id`. Is automatically set to the type of the passed `features`. type: `Schema | None = None` Define schema types like `ln.Schema(name="ProteinPanel", is_type=True)`. is_type: `bool = False` Whether the schema is a type. index: `Feature | None = None` A `Feature` record to validate an index of a `DataFrame` and therefore also, e.g., `AnnData` obs and var indices. flexible: `bool | None = None` Whether to include any feature of the same `itype` during validation & annotation. If `features` is passed, defaults to `False` so that, e.g., additional columns of a `DataFrame` encountered during validation are disregarded. If `features` is not passed, defaults to `True`. otype: `str | None = None` An object type to define the structure of a composite schema, e.g., `"DataFrame"`, `"AnnData"`. dtype: `str | None = None` A `dtype` to assume for all features in the schema (e.g., "num", float, int). Defaults to `None` if `itype` is `Feature`. Otherwise to `"num"`, e.g., if `itype` is `bt.Gene.ensembl_gene_id`. minimal_set: `bool = True` Whether all passed features are required by default. See :attr:`~lamindb.Schema.optionals` for more-fine-grained control. maximal_set: `bool = False` Whether additional features are allowed. ordered_set: `bool = False` Whether features are required to be ordered. coerce: `bool | None = None` When True, attempts to coerce values to the specified dtype during validation, see :attr:`~lamindb.Schema.coerce`. n_members: `int | None = None` A manual way of specifying the number of features in the schema. Is inferred from `features` if passed. See Also: :meth:`~lamindb.Artifact.from_dataframe` Validate & annotate a `DataFrame` with a schema. :meth:`~lamindb.Artifact.from_anndata` Validate & annotate an `AnnData` with a schema. :meth:`~lamindb.Artifact.from_mudata` Validate & annotate an `MuData` with a schema. :meth:`~lamindb.Artifact.from_spatialdata` Validate & annotate a `SpatialData` with a schema. 
Examples: A schema with a single required feature:: import lamindb as ln schema = ln.Schema([ln.Feature(name="required_feature", dtype=str).save()]).save() A schema that constrains feature identifiers to be a valid feature names:: schema = ln.Schema(itype=ln.Feature) # uses Feature.name as identifier type Or valid Ensembl gene ids:: import bionty as bt schema = ln.Schema(itype=bt.Gene.ensembl_gene_id) A `flexible` schema that *requires* a single feature but *also* validates & annotates additional features with registered feature identifiers:: schema = ln.Schema( [ln.Feature(name="required_feature", dtype=str).save()], itype=ln.Feature, flexible=True, ).save() Create a schema type to group schemas:: protein_panel = ln.Schema(name="ProteinPanel", is_type=True).save() schema = ln.Schema(itype=bt.CellMarker, type=protein_panel).save() Validate the `index` of a `DataFrame`:: schema = ln.Schema( [ln.Feature(name="required_feature", dtype=str).save()], index=ln.Feature(name="sample", dtype=ln.ULabel).save(), ).save() Mark a feature as `optional`:: schema = ln.Schema([ ln.Feature(name="required_feature", dtype=str).save(), ln.Feature(name="feature2", dtype=int).save().with_config(optional=True), ]).save() Parse & validate feature identifier values:: schema = ln.Schema.from_values( adata.var["ensemble_id"], field=bt.Gene.ensembl_gene_id, organism="mouse", ).save() Create a schema from a `DataFrame`:: df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]}) schema = ln.Schema.from_dataframe(df) """ class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" # also see raw SQL constraints for `is_type` and `type` FK validity in migrations _name_field: str = "name" _aux_fields: dict[str, tuple[str, type]] = { "1": ("optionals", list[str]), "3": ("index_feature_uid", str), } id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField(max_length=16, unique=True, db_index=True, editable=False) """A universal id.""" name: str | None = CharField(max_length=150, null=True, db_index=True) """A name.""" description: str | None = TextField(null=True) """A description.""" n_members: int | None = IntegerField(null=True, default=None) """Number of features in the schema. None for type-like schemas.""" coerce: bool | None = BooleanField(null=True, default=None) """Whether dtypes should be coerced during validation. None for type-like schemas.""" flexible: bool | None = BooleanField(null=True, default=None) """Indicates how to handle validation and annotation in case features are not defined. Examples: Make a rigid schema flexible:: schema = ln.Schema.get(name="my_schema") schema.flexible = True schema.save() During schema creation:: # if you're not passing features but just defining the itype, defaults to flexible = True schema = ln.Schema(itype=ln.Feature).save() # schema.flexible is True # if you're passing features, defaults to flexible = False schema = ln.Schema( features=[ln.Feature(name="my_required_feature", dtype=int).save()], ) # schema.flexible is False # you can also validate & annotate features in addition to those that you're explicitly defining: schema = ln.Schema( features=[ln.Feature(name="my_required_feature", dtype=int).save()], flexible=True, ) # schema.flexible is True """ type: Schema | None = ForeignKey("self", PROTECT, null=True, related_name="schemas") """Type of schema. Allows to group schemas by type, e.g., all meassurements evaluating gene expression vs. 
protein expression vs. multi modal. You can define types via `ln.Schema(name="ProteinPanel", is_type=True)`. Here are a few more examples for type names: `'ExpressionPanel'`, `'ProteinPanel'`, `'Multimodal'`, `'Metadata'`, `'Embedding'`. """ schemas: RelatedManager[Schema] """Schemas of this type (can only be non-empty if `is_type` is `True`).""" itype: str | None = CharField( max_length=120, db_index=True, null=True, editable=False ) """A field of a registry that stores feature identifier types, e.g., `'Feature.name'` or `'bionty.Gene.ensembl_gene_id'`. Defaults to the default name field if a registry is passed (passing `Feature` would result in `Feature.name`). Depending on `itype`, `.members` stores, e.g., `Feature` or `bionty.Gene` records. """ otype: str | None = CharField(max_length=64, db_index=True, null=True) """Default Python object type, e.g., DataFrame, AnnData.""" _dtype_str: str | None = CharField(max_length=64, null=True, editable=False) """Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`. For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level. """ hash: str | None = CharField( max_length=HASH_LENGTH, db_index=True, null=True, editable=False ) """A hash of the set of feature identifiers. For a composite schema, the hash of hashes. """ minimal_set: bool = BooleanField(default=True, db_index=True, editable=False) """Whether all passed features are to be considered required by default (default `True`). Note that features that are explicitly marked as `optional` via `feature.with_config(optional=True)` are **not** required even if this `minimal_set` is true. """ ordered_set: bool = BooleanField(default=False, db_index=True, editable=False) """Whether features are required to be ordered (default `False`).""" maximal_set: bool = BooleanField(default=False, db_index=True, editable=False) """Whether all features present in the dataset must be in the schema (default `False`). If `False`, additional features are allowed to be present in the dataset. If `True`, no additional features are allowed to be present in the dataset. """ components: RelatedManager[Schema] = ManyToManyField( "self", through="SchemaComponent", symmetrical=False, related_name="composites" ) """Components of this schema ← :attr:`~lamindb.Schema.composites`.""" composites: RelatedManager[Schema] """The composite schemas that contains this schema as a component ← :attr:`~lamindb.Schema.components`. For example, an `AnnData` composes multiple schemas: `var[DataFrameT]`, `obs[DataFrame]`, `obsm[Array]`, `uns[dict]`, etc. 
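    Example:

        A minimal sketch, assuming `obs_schema` and `var_schema` are saved
        :class:`~lamindb.Schema` objects::

            import lamindb as ln

            anndata_schema = ln.Schema(
                slots={"obs": obs_schema, "var.T": var_schema}, otype="AnnData"
            ).save()
            obs_schema.composites.all()  # contains anndata_schema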
""" features: RelatedManager[Feature] """The features contained in the schema ← :attr:`~lamindb.Feature.schemas`.""" artifacts: RelatedManager[Artifact] """The artifacts with an inferred schema that matches this schema ← :attr:`~lamindb.Artifact.schemas`.""" validated_artifacts: Artifact """The artifacts that were validated against this schema ← :attr:`~lamindb.Artifact.schema`.""" projects: RelatedManager[Project] """Linked projects ← :attr:`~lamindb.Project.schemas`.""" records: RelatedManager[Record] """Records that were annotated with this schema ← :attr:`~lamindb.Record.schema`.""" ablocks: RelatedManager[SchemaBlock] """Attached blocks ← :attr:`~lamindb.SchemaBlock.schema`.""" @overload def __init__( self, features: list[SQLRecord] | SQLRecordList | list[tuple[Feature, dict]] | None = None, *, slots: dict[str, Schema] | None = None, name: str | None = None, description: str | None = None, itype: str | Registry | FieldAttr | None = None, type: Schema | None = None, is_type: bool = False, index: Feature | None = None, flexible: bool | None = None, otype: str | None = None, dtype: str | Type[int | float | str] | None = None, # noqa minimal_set: bool = True, maximal_set: bool = False, ordered_set: bool = False, coerce: bool | None = None, n_members: int | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None if len(args) > 1: raise ValueError("Only one non-keyword arg allowed: features") features: list[SQLRecord] | None = ( args[0] if args else kwargs.pop("features", []) ) index: Feature | None = kwargs.pop("index", None) slots: dict[str, Schema] = kwargs.pop("slots", {}) name: str | None = kwargs.pop("name", None) description: str | None = kwargs.pop("description", None) itype: str | SQLRecord | DeferredAttribute | None = kwargs.pop("itype", None) flexible: bool | None = kwargs.pop("flexible", None) type: Feature | None = kwargs.pop("type", None) is_type: bool = kwargs.pop("is_type", False) otype: str | None = kwargs.pop("otype", None) dtype: str | None = kwargs.pop("dtype", None) minimal_set: bool = kwargs.pop("minimal_set", True) ordered_set: bool = kwargs.pop("ordered_set", False) maximal_set: bool = kwargs.pop("maximal_set", False) if "coerce_dtype" in kwargs: warnings.warn( "`coerce_dtype` argument was renamed to `coerce` and will be removed in a future release.", DeprecationWarning, stacklevel=2, ) coerce_dtype = kwargs.pop("coerce_dtype") else: coerce_dtype = kwargs.pop("coerce", None) using: str | None = kwargs.pop("using", None) if "n" in kwargs: warnings.warn( "`n` argument was renamed to `n_members` and will be removed in a future release.", DeprecationWarning, stacklevel=2, ) n_features = kwargs.pop("n") else: n_features = kwargs.pop("n_members", None) kwargs.pop("branch", None) kwargs.pop("branch_id", 1) kwargs.pop("space", None) kwargs.pop("space_id", 1) # backward compat if not slots: if "components" in kwargs: logger.warning( "`components` as a keyword argument is deprecated, please use `slots` instead" ) slots = kwargs.pop("components") if kwargs: valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Schema)]) raise FieldValidationError( f"Only {valid_keywords} are valid keyword arguments" ) ( features, validated_kwargs, optional_features, features_registry, flexible, ) = self._validate_kwargs_calculate_hash( features=features, index=index, slots=slots, name=name, description=description, itype=itype, 
flexible=flexible, type=type, is_type=is_type, otype=otype, dtype=dtype, minimal_set=minimal_set, ordered_set=ordered_set, maximal_set=maximal_set, coerce=coerce_dtype, n_features=n_features, ) if not features and not slots and not is_type and not itype: raise InvalidArgument( "Please pass features or slots or itype or set is_type=True" ) if not is_type: schema = ( Schema.objects.using(using) .filter( ~Q(branch_id=-1), hash=validated_kwargs["hash"], ) .one_or_none() ) if schema is not None: logger.important(f"returning schema with same hash: {schema}") init_self_from_db(self, schema) update_attributes(self, validated_kwargs) self.optionals.set(optional_features) return None self._slots: dict[str, Schema] = {} if features: self._features = (get_related_name(features_registry), features) # type: ignore if slots: for slot_key, component in slots.items(): if component._state.adding: raise InvalidArgument( f"schema for {slot_key} {component} must be saved before use" ) self._slots = slots if validated_kwargs["hash"] in KNOWN_SCHEMAS: validated_kwargs["uid"] = KNOWN_SCHEMAS[validated_kwargs["hash"]] else: validated_kwargs["uid"] = base62_16() super().__init__(**validated_kwargs) def query_schemas(self) -> QuerySet: """Query schemas of sub types. While `.schemas` retrieves the schemas with the current type, this method also retrieves sub types and the schemas with sub types of the current type. """ return _query_relatives([self], "schemas") # type: ignore def _validate_kwargs_calculate_hash( self, features: list[SQLRecord], index: Feature | None, slots: dict[str, Schema], name: str | None, description: str | None, itype: str | SQLRecord | DeferredAttribute | None, flexible: bool | None, type: Feature | None, is_type: bool, otype: str | None, dtype: str | None, minimal_set: bool, ordered_set: bool, maximal_set: bool, coerce: bool | None, n_features: int | None, optional_features_manual: list[Feature] | None = None, ) -> tuple[list[Feature], dict[str, Any], list[Feature], Registry, bool]: optional_features = [] features_registry: Registry = None if itype is not None: if itype != "Composite": itype = serialize_dtype(itype, is_itype=True) else: warnings.warn( "please do not pass the deprecated itype='Composite'", stacklevel=2 ) if index is not None: if not isinstance(index, Feature): raise TypeError("index must be a Feature") features.insert(0, index) if features: features, configs = get_features_config(features) features_registry = validate_features(features) itype_compare = features_registry.__get_name_with_module__() if itype is not None: assert itype.startswith(itype_compare), str(itype_compare) # noqa: S101 else: itype = itype_compare if n_features is not None: if n_features != len(features): logger.important(f"updating to n {len(features)} features") n_features = len(features) if features_registry == Feature: optional_features = [ config[0] for config in configs if config[1].get("optional") ] if optional_features: assert optional_features_manual is None # noqa: S101 if not optional_features and optional_features_manual is not None: optional_features = optional_features_manual # n_features stays None if no features passed (flexible schema) if dtype is None: dtype = None if itype is not None and itype == "Feature" else NUMBER_TYPE else: dtype = get_type_str(dtype) if slots: if otype is None: raise InvalidArgument("Please pass otype != None for composite schemas") flexible_default = n_features is None if flexible is None: flexible = flexible_default if itype is not None and not isinstance(itype, 
str): itype_str = serialize_dtype(itype, is_itype=True) else: itype_str = itype validated_kwargs = { "name": name, "description": description, "type": type, "is_type": is_type, "_dtype_str": dtype, "otype": otype, "n_members": n_features, "itype": itype_str, "minimal_set": minimal_set, "ordered_set": ordered_set, "maximal_set": maximal_set, "coerce": coerce if coerce else None, "flexible": flexible, } n_features_default = ( None # None means flexible schema (no fixed number of features) ) coerce_default = False aux_dict: dict[str, dict[str, bool | str | list[str]]] = {} # optional features (key "1") - remains in _aux if optional_features: aux_dict.setdefault("af", {})["1"] = [f.uid for f in optional_features] # index feature (key "3") - remains in _aux if index is not None: aux_dict.setdefault("af", {})["3"] = index.uid if aux_dict: validated_kwargs["_aux"] = aux_dict HASH_CODE = { "_dtype_str": "a", "itype": "b", "minimal_set": "c", "ordered_set": "d", "maximal_set": "e", "flexible": "f", "coerce_dtype": "g", "n": "h", "optional": "i", "features_hash": "j", "index": "k", "slots_hash": "l", } # we do not want pure informational annotations like otype, name, type, is_type, otype to be part of the hash hash_args = ["_dtype_str", "itype", "minimal_set", "ordered_set", "maximal_set"] list_for_hashing = [ f"{HASH_CODE[arg]}={validated_kwargs[arg]}" for arg in hash_args if validated_kwargs[arg] is not None ] # only include in hash if not default so that it's backward compatible with records for which flexible was never set if flexible != flexible_default: list_for_hashing.append(f"{HASH_CODE['flexible']}={flexible}") if coerce is not None and coerce != coerce_default: list_for_hashing.append(f"{HASH_CODE['coerce_dtype']}={coerce}") if n_features is not None and n_features != n_features_default: list_for_hashing.append(f"{HASH_CODE['n']}={n_features}") if index is not None: list_for_hashing.append(f"{HASH_CODE['index']}={index.uid}") if features: if optional_features: feature_list_for_hashing = [ feature.uid if feature not in set(optional_features) else f"{feature.uid}({HASH_CODE['optional']})" for feature in features ] else: feature_list_for_hashing = [feature.uid for feature in features] if not ordered_set: # order matters if ordered_set is True, if not sort feature_list_for_hashing = sorted(feature_list_for_hashing) features_hash = hash_string(":".join(feature_list_for_hashing)) list_for_hashing.append(f"{HASH_CODE['features_hash']}={features_hash}") if slots: slots_list_for_hashing = sorted( [f"{key}={component.hash}" for key, component in slots.items()] ) slots_hash = hash_string(":".join(slots_list_for_hashing)) list_for_hashing.append(f"{HASH_CODE['slots_hash']}={slots_hash}") if is_type: validated_kwargs["hash"] = None else: self._list_for_hashing = sorted(list_for_hashing) schema_hash = hash_string(":".join(self._list_for_hashing)) validated_kwargs["hash"] = schema_hash return ( features, validated_kwargs, optional_features, features_registry, flexible, ) @classmethod def from_values( # type: ignore cls, values: ListLike, field: FieldAttr = Feature.name, dtype: str | None = None, name: str | None = None, mute: bool = False, organism: SQLRecord | str | None = None, source: SQLRecord | None = None, raise_validation_error: bool = True, ) -> Schema: """Create feature set for validated features. Args: values: A list of values, like feature names or ids. field: The field of a reference registry to map values. dtype: The simple dtype. 
Defaults to `None` if reference registry is :class:`~lamindb.Feature`, defaults to `"float"` otherwise. name: A name. organism: An organism to resolve gene mapping. source: A public ontology to resolve feature identifier mapping. raise_validation_error: Whether to raise a validation error if some values are not valid. Raises: ValidationError: If some values are not valid. Example: :: import lamindb as ln import bionty as bt features = [ln.Feature(name=feat, dtype="str").save() for feat in ["feat11", "feat21"]] schema = ln.Schema.from_values(features) genes = ["ENSG00000139618", "ENSG00000198786"] schema = ln.Schema.from_values(features, bt.Gene.ensembl_gene_id, "float") """ if not isinstance(field, FieldAttr): raise TypeError( "Argument `field` must be a SQLRecord field, e.g., `Feature.name`" ) if len(values) == 0: raise ValueError("Provide a list of at least one value") if isinstance(values, DICT_KEYS_TYPE): values = list(values) registry = field.field.model if registry != Feature and dtype is None: dtype = NUMBER_TYPE logger.debug("setting feature set to 'number'") validated = registry.validate(values, field=field, mute=mute, organism=organism) values_array = np.array(values) validated_values = values_array[validated] if validated.sum() != len(values): not_validated_values = values_array[~validated] msg = ( f"These values could not be validated: {not_validated_values.tolist()}\n" f"If there are no typos, add them to their registry: {registry.__name__}" ) if raise_validation_error: raise ValidationError(msg) elif len(validated_values) == 0: return None # temporarily return None here validated_features = registry.from_values( validated_values, field=field, organism=organism, source=source, ) schema = Schema( features=validated_features, name=name, dtype=get_type_str(dtype), ) return schema @classmethod def from_dataframe( cls, df: pd.DataFrame, field: FieldAttr = Feature.name, name: str | None = None, mute: bool = False, organism: SQLRecord | str | None = None, source: SQLRecord | None = None, ) -> Schema | None: """Create schema for valid columns.""" registry = field.field.model validated = registry.validate( df.columns, field=field, mute=mute, organism=organism ) if validated.sum() == 0: if not mute: logger.warning("no validated features, skip creating schema") return None if registry == Feature: validated_features = Feature.from_values( # type: ignore df.columns, field=field, organism=organism ) schema = Schema( list(validated_features), name=name, dtype=None, otype="DataFrame" ) else: dtypes = [col.dtype for (_, col) in df.loc[:, validated].items()] if len(set(dtypes)) != 1: raise ValueError(f"data types are heterogeneous: {set(dtypes)}") dtype = serialize_pandas_dtype(dtypes[0]) validated_features = registry.from_values( df.columns[validated], field=field, organism=organism, source=source, ) schema = Schema( features=list(validated_features), name=name, dtype=get_type_str(dtype), ) return schema @classmethod @deprecated("from_dataframe") def from_df( cls, df: pd.DataFrame, field: FieldAttr = Feature.name, name: str | None = None, mute: bool = False, organism: SQLRecord | str | None = None, source: SQLRecord | None = None, ) -> Schema | None: return cls.from_dataframe(df, field, name, mute, organism, source) def save(self, *args, **kwargs) -> Schema: """Save schema.""" from .save import bulk_create features_to_delete = [] print_hash_mutation_warning = kwargs.pop("print_hash_mutation_warning", True) if self.pk is not None: existing_features = self.members.to_list() if 
self.members.exists() else [] if hasattr(self, "_features"): features = self._features[1] if features != existing_features: features_to_delete = [ f for f in existing_features if f not in features ] else: features = existing_features index_feature = self.index index_feature_id = None if index_feature is None else index_feature.id _, validated_kwargs, _, _, _ = self._validate_kwargs_calculate_hash( features=[ # type: ignore f for f in features if index_feature_id is None or f.id != index_feature_id ], index=index_feature, slots=self.slots, name=self.name, description=self.description, itype=self.itype, flexible=self.flexible, type=self.type, is_type=self.is_type, otype=self.otype, dtype=self.dtype, minimal_set=self.minimal_set, ordered_set=self.ordered_set, maximal_set=self.maximal_set, coerce=self.coerce, n_features=self.n_members, optional_features_manual=self.optionals.get(), ) if validated_kwargs["hash"] != self.hash: from .artifact import Artifact datasets = Artifact.filter(schema=self) if datasets.exists(): if features_to_delete: logger.warning( f"you're removing these features: {features_to_delete}" ) if print_hash_mutation_warning: logger.warning( f"you updated the schema hash and might invalidate datasets that were previously validated with this schema:\n{datasets.to_dataframe()}" ) self.hash = validated_kwargs["hash"] self.n_members = validated_kwargs["n_members"] super().save(*args, **kwargs) if hasattr(self, "_slots"): # analogous to save_schema_links in core._data.py # which is called to save feature sets in artifact.save() links = [] for slot, component in self._slots.items(): kwargs = { "composite_id": self.id, "component_id": component.id, "slot": slot, } links.append(Schema.components.through(**kwargs)) bulk_create(links, ignore_conflicts=True) delattr(self, "_slots") if hasattr(self, "_features"): assert self.n_members > 0 # noqa: S101 using: bool | None = kwargs.pop("using", None) related_name, records = self._features # self.related_name.set(features) does **not** preserve the order # but orders by the feature primary key # hence we need the following more complicated logic through_model = getattr(self, related_name).through if self.itype == "Composite": related_model_split = ["Feature"] else: related_model_split = parse_cat_dtype(self.itype, is_itype=True)[ "registry_str" ].split(".") if len(related_model_split) == 1: related_field = related_model_split[0].lower() else: related_field = related_model_split[1].lower() related_field_id = f"{related_field}_id" links = [ through_model(**{"schema_id": self.id, related_field_id: record.id}) for record in records ] through_model.objects.using(using).bulk_create(links, ignore_conflicts=True) getattr(self, related_name).remove(*features_to_delete) delattr(self, "_features") return self @property def members(self) -> QuerySet: """A queryset for the individual records in the feature set underlying the schema. Unlike the many-to-many fields `schema.features`, `schema.genes`, `schema.proteins`, `.members` 1. returns an ordered `QuerySet` if the schema is saved or a `SQLRecordList` if the schema is unsaved 2. doesn't require knowledge of the registry storing the feature identifiers (`ln.Feature`, `bt.Gene`, `bt.Protein`, etc.) 3. works for a dynamically created (unsaved) schema """ if self._state.adding: # this should return a queryset and not a list... 
# need to fix this return SQLRecordList(self._features[1]) # type: ignore if self.itype == "Composite" or self.is_type: return Feature.objects.none() related_name = self._get_related_name() if related_name is None: related_name = "features" related_manager = self.__getattribute__(related_name) through_model = related_manager.through using = self._state.db related_fk_name = next( field.name for field in through_model._meta.fields if isinstance(field, models.ForeignKey) and field.name != "schema" ) # Avoid the previous simple `order_by("links_schema__id")` on the related # manager: a member can be linked to many schemas, and reverse-join ordering # can become ambiguous across DB backends (SQLite vs Postgres). Instead, we # order through rows constrained to this schema and preserve that exact order. member_ids = list( through_model.objects.using(using) .filter(schema_id=self.id) .order_by("id") .values_list(f"{related_fk_name}_id", flat=True) ) if not member_ids: return related_manager.model.objects.using(using).none() preserved_order = models.Case( *[ models.When(id=member_id, then=models.Value(idx)) for idx, member_id in enumerate(member_ids) ], output_field=models.IntegerField(), ) # Order by ids from the through table constrained to this schema to avoid # ambiguous reverse-join ordering when a member is linked to many schemas. return ( related_manager.model.objects.using(using) .filter(id__in=member_ids) .order_by(preserved_order) ) @property def dtype(self) -> str | None: """The `dtype` for all features in the schema.""" return self._dtype_str @dtype.setter def dtype(self, value: str | None) -> None: self._dtype_str = value @property @deprecated("coerce") def coerce_dtype(self) -> bool | None: """Alias for coerce (backward compatibility).""" return self.coerce @coerce_dtype.setter def coerce_dtype(self, value: bool | None) -> None: self.coerce = value @property @deprecated("n_members") def n(self) -> int | None: """Alias for n_members (backward compatibility).""" return self.n_members @n.setter def n(self, value: int | None) -> None: self.n_members = value @property def index(self) -> None | Feature: """The feature configured to act as index. To unset it, set `schema.index` to `None`. """ if self._index_feature_uid is None: return None if hasattr(self, "_features"): _, features = self._features for feature in features: if feature.uid == self._index_feature_uid: return feature return self.features.get(uid=self._index_feature_uid) @index.setter def index(self, value: None | Feature) -> None: if value is None: current_index = self.index self.features.remove(current_index) self._index_feature_uid = value else: self.features.add(value) self._index_feature_uid = value.uid @property def _index_feature_uid(self) -> None | str: """The uid of the index feature.""" if self._aux is not None and "af" in self._aux and "3" in self._aux["af"]: return self._aux["af"]["3"] else: return None @_index_feature_uid.setter def _index_feature_uid(self, value: str | None) -> None: self._aux = self._aux or {} if value is None: self._aux.get("af", {}).pop("3") else: self._aux.setdefault("af", {})["3"] = value @property def slots(self) -> dict[str, Schema]: """Slots. 
Examples: :: # define composite schema anndata_schema = ln.Schema( name="mini_immuno_anndata_schema", otype="AnnData", slots={"obs": obs_schema, "var": var_schema}, ).save() # access slots anndata_schema.slots #> {'obs': , 'var': } """ if hasattr(self, "_slots"): return self._slots self._slots = { link.slot: link.component for link in self.components.through.filter(composite_id=self.id) } return self._slots @property def optionals(self) -> SchemaOptionals: """Manage optional features. Example: :: # a schema with optional "sample_name" schema_optional_sample_name = ln.Schema( features=[ ln.Feature(name="sample_id", dtype=str).save(), # required ln.Feature(name="sample_name", dtype=str).save().with_config(optional=True), # optional ], ).save() # raises ValidationError because the required column `sample_id` is missing ln.curators.DataFrameCurator( pd.DataFrame( { "sample_name": ["Sample 1", "Sample 2"], } ), schema=schema_optional_sample_name).validate() # passes because only the optional column `sample_name` is missing ln.curators.DataFrameCurator( pd.DataFrame( { "sample_id": ["sample1", "sample2"], } ), schema=schema_optional_sample_name).validate() """ return SchemaOptionals(self) def add_optional_features(self, features: list[Feature]) -> None: """Add optional features to the schema.""" self.features.add(*features) self.optionals.add(features) self.save(print_hash_mutation_warning=False) def remove_optional_features(self, features: list[Feature]) -> None: """Remove optional features from the schema.""" optional_features = self.optionals.get() for feature in features: assert feature in optional_features, f"Feature {feature} is not optional" self.features.remove(*features) self.optionals.remove(features) self.save(print_hash_mutation_warning=False) @class_and_instance_method def describe(cls_or_self, return_str: bool = False) -> None | str: """Describe schema.""" if isinstance(cls_or_self, type): return type(cls_or_self).describe(cls_or_self) # type: ignore if cls_or_self.pk is None: raise ValueError("Schema must be saved before describing") tree = describe_schema(cls_or_self) for slot, schema in cls_or_self.slots.items(): tree.add(describe_schema(schema, slot=slot)) return format_rich_tree(tree, return_str=return_str) def get_type_str(dtype: str | None) -> str | None: if dtype is not None: type_str = dtype.__name__ if not isinstance(dtype, str) else dtype # type: ignore else: type_str = None return type_str def _get_related_name(self: Schema) -> str | None: related_models = dict_related_model_to_related_name(self, instance=self._state.db) if self.itype: related_name = related_models.get( parse_cat_dtype(self.itype, is_itype=True)["registry_str"] ) return related_name return None class SchemaFeature(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) schema: Schema = ForeignKey(Schema, CASCADE, related_name="links_feature") feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_schema") class Meta: app_label = "lamindb" unique_together = ("schema", "feature") class ArtifactSchema(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey("Artifact", CASCADE, related_name="_links_schema") schema: Schema = ForeignKey(Schema, PROTECT, related_name="_links_artifact") slot: str | None = CharField(null=True) feature_ref_is_semantic: bool | None = BooleanField(null=True) class Meta: app_label = "lamindb" unique_together = (("artifact", "schema"), ("artifact", "slot")) class SchemaComponent(BaseSQLRecord, IsLink, TracksRun): id: int =
models.BigAutoField(primary_key=True) composite: Schema = ForeignKey(Schema, CASCADE, related_name="links_component") component: Schema = ForeignKey(Schema, PROTECT, related_name="links_composite") slot: str | None = CharField(null=True) class Meta: app_label = "lamindb" unique_together = (("composite", "slot", "component"), ("composite", "slot")) Schema._get_related_name = _get_related_name # PostgreSQL migration helpers for auxiliary fields # These are used by migrations to efficiently migrate data from _aux to Django fields def migrate_auxiliary_fields_postgres(schema_editor) -> None: """Migrate _aux['af'] fields to Django fields using PostgreSQL raw SQL. This efficiently migrates auxiliary fields for all affected models: **Artifact:** - _save_completed from _aux['af']['0'] **Run:** - cli_args from _aux['af']['0'] **Feature:** - default_value from _aux['af']['0'] - nullable from _aux['af']['1'] (default: True) - coerce from _aux['af']['2'] (default: False) - For type features (is_type=True), all values are set to NULL **Schema:** - coerce from _aux['af']['0'] - flexible from _aux['af']['2'] (or computed from n_members) - n_members (converted from negative to NULL) - For type schemas (is_type=True), all values are set to NULL - Keys '1' (optionals) and '3' (index_feature_uid) are preserved in _aux """ # Artifact: migrate _save_completed from _aux->'af'->'0' schema_editor.execute(""" UPDATE lamindb_artifact SET _save_completed = (_aux->'af'->>'0')::boolean, _aux = CASE WHEN _aux->'af' IS NOT NULL THEN CASE WHEN _aux - 'af' = '{}'::jsonb THEN NULL ELSE _aux - 'af' END ELSE _aux END WHERE _aux IS NOT NULL AND _aux->'af' IS NOT NULL """) # Run: migrate cli_args from _aux->'af'->'0' schema_editor.execute(""" UPDATE lamindb_run SET cli_args = _aux->'af'->>'0', _aux = CASE WHEN _aux - 'af' = '{}'::jsonb THEN NULL ELSE _aux - 'af' END WHERE _aux IS NOT NULL AND _aux ? 
'af' """) # Feature: migrate default_value, nullable, coerce # For type features: set all to NULL schema_editor.execute(""" UPDATE lamindb_feature SET default_value = NULL, nullable = NULL, coerce = NULL, _aux = CASE WHEN _aux->'af' IS NOT NULL THEN CASE WHEN _aux - 'af' = '{}'::jsonb THEN NULL ELSE _aux - 'af' END ELSE _aux END WHERE is_type = TRUE """) # For regular features: migrate values with defaults schema_editor.execute(""" UPDATE lamindb_feature SET default_value = _aux->'af'->'0', nullable = COALESCE((_aux->'af'->>'1')::boolean, TRUE), coerce = COALESCE((_aux->'af'->>'2')::boolean, FALSE), _aux = CASE WHEN _aux->'af' IS NOT NULL THEN CASE WHEN _aux - 'af' = '{}'::jsonb THEN NULL ELSE _aux - 'af' END ELSE _aux END WHERE is_type = FALSE OR is_type IS NULL """) # Schema: migrate coerce, flexible, n_members # For type schemas: set all to NULL schema_editor.execute(""" UPDATE lamindb_schema SET coerce = NULL, flexible = NULL, n_members = NULL, _aux = CASE WHEN _aux->'af' IS NOT NULL THEN CASE WHEN ((_aux->'af') #- ARRAY['0'] #- ARRAY['2']) = '{}'::jsonb THEN CASE WHEN (_aux #- ARRAY['af']) = '{}'::jsonb THEN NULL ELSE _aux #- ARRAY['af'] END ELSE jsonb_set(_aux #- ARRAY['af'], '{af}', (_aux->'af') #- ARRAY['0'] #- ARRAY['2']) END ELSE _aux END WHERE is_type = TRUE """) # For regular schemas: migrate values # Keep '1' (optionals) and '3' (index_feature_uid) in _aux schema_editor.execute(""" UPDATE lamindb_schema SET coerce = (_aux->'af'->>'0')::boolean, flexible = COALESCE( (_aux->'af'->>'2')::boolean, n_members IS NULL OR n_members < 0 ), n_members = CASE WHEN n_members < 0 THEN NULL ELSE n_members END, _aux = CASE WHEN _aux->'af' IS NOT NULL THEN CASE WHEN ((_aux->'af') #- ARRAY['0'] #- ARRAY['2']) = '{}'::jsonb THEN CASE WHEN (_aux #- ARRAY['af']) = '{}'::jsonb THEN NULL ELSE _aux #- ARRAY['af'] END ELSE jsonb_set( CASE WHEN (_aux #- ARRAY['af']) = '{}'::jsonb THEN '{}'::jsonb ELSE _aux #- ARRAY['af'] END, '{af}', (_aux->'af') #- ARRAY['0'] #- ARRAY['2'] ) END ELSE _aux END WHERE is_type = FALSE OR is_type IS NULL """) ================================================ FILE: lamindb/models/sqlrecord.py ================================================ from __future__ import annotations import builtins import gzip import inspect import os import re import shutil import sys from collections import defaultdict from itertools import chain from pathlib import Path from typing import ( TYPE_CHECKING, Any, Literal, NamedTuple, TypeVar, overload, ) import dj_database_url import lamindb_setup as ln_setup from django.core.exceptions import ValidationError as DjangoValidationError from django.db import IntegrityError, ProgrammingError, connections, models, transaction from django.db.models import CASCADE, DEFERRED, PROTECT, Field, Manager, QuerySet from django.db.models import ForeignKey as django_ForeignKey from django.db.models.base import ModelBase from django.db.models.fields.related import ( ManyToManyField, ManyToManyRel, ManyToOneRel, ) from django.db.models.functions import Lower from lamin_utils import colors, logger from lamindb_setup import settings as setup_settings from lamindb_setup._connect_instance import ( INSTANCE_NOT_FOUND_MESSAGE, InstanceNotFoundError, get_owner_name_from_identifier, load_instance_settings, update_db_using_local, ) from lamindb_setup.core._docs import doc_args from lamindb_setup.core._hub_core import connect_instance_hub from lamindb_setup.core._settings_store import instance_settings_file from lamindb_setup.core.django import DBToken, db_token_manager from 
upath import UPath from lamindb.base.users import current_user_id from lamindb.base.utils import class_and_instance_method, deprecated from ..base.fields import ( BooleanField, CharField, DateTimeField, ForeignKey, JSONField, TextField, ) from ..base.types import ( BRANCH_CODE_TO_STATUS, BRANCH_STATUS_TO_CODE, BranchStatus, FieldAttr, StrField, ) from ..base.uids import base62_12 from ..errors import ( FieldValidationError, NoWriteAccess, ValidationError, ) from ._is_versioned import IsVersioned, _adjust_is_latest_when_deleting_is_versioned from .query_manager import QueryManager, _lookup, _search if TYPE_CHECKING: from datetime import datetime import pandas as pd from .block import BranchBlock, SpaceBlock from .project import Project from .query_manager import RelatedManager from .query_set import SQLRecordList from .run import Run, User from .ulabel import ULabel T = TypeVar("T", bound="SQLRecord") IPYTHON = getattr(builtins, "__IPYTHON__", False) UNIQUE_FIELD_NAMES = { "root", "ontology_id", "uid", "scientific_name", "ensembl_gene_id", "uniprotkb_id", } BRANCH_SENSITIVE_BLOCK_MODEL_NAMES = frozenset( { "RecordBlock", "ArtifactBlock", "TransformBlock", "CollectionBlock", "RunBlock", "SchemaBlock", "FeatureBlock", "ProjectBlock", "ULabelBlock", "SpaceBlock", } ) def _is_branch_sensitive_model(model: type[BaseSQLRecord]) -> bool: return ( issubclass(model, SQLRecord) and model.__name__ not in {"Storage", "Source"} ) or model.__name__ in BRANCH_SENSITIVE_BLOCK_MODEL_NAMES # ------------------------------------------------------------------------------------- # A note on required fields at the SQLRecord level # # As Django does most of its validation on the Form-level, it doesn't offer functionality # for validating the integrity of an SQLRecord object upon instantation (similar to pydantic) # # For required fields, we define them as commonly done on the SQL level together # with a validator in SQLRecord (validate_required_fields) # # This goes against the Django convention, but goes with the SQLModel convention # (Optional fields can be null on the SQL level, non-optional fields cannot) # # Due to Django's convention where CharFieldAttr has pre-configured (null=False, default=""), marking # a required field necessitates passing `default=None`. Without the validator it would trigger # an error at the SQL-level, with it, it triggers it at instantiation # ------------------------------------------------------------------------------------- # A note on class and instance methods of core SQLRecord # # All of these are defined and tested within lamindb, in files starting with _{orm_name}.py # ------------------------------------------------------------------------------------- # A note on maximal lengths of char fields # # 100 characters: # "Raindrops pitter-pattered on the windowpane, blurring the" # "city lights outside, curled up with a mug." # A good maximal length for a name (title). # # 150 characters: We choose this for name maximal length because some users like long names. # # 255 characters: # "In creating a precise 255-character paragraph, one engages in" # "a dance of words, where clarity meets brevity. Every syllable counts," # "illustrating the skill in compact expression, ensuring the essence of the" # "message shines through within the exacting limit." class IsLink: pass class HasType(models.Model): """Mixin for registries that have a hierarchical `type` assigned. Such registries have a `.type` foreign key pointing to themselves. 
A `type` hence allows hierarchically grouping records under types. For instance, using the example of `ln.Record`:: experiment_type = ln.Record(name="Experiment", is_type=True).save() experiment1 = ln.Record(name="Experiment 1", type=experiment_type).save() experiment2 = ln.Record(name="Experiment 2", type=experiment_type).save() """ class Meta: abstract = True is_type: bool = BooleanField(default=False, db_default=False, db_index=True) """Indicates if record is a `type`. For example, if a record "Compound" is a `type`, the actual compounds "darerinib", "tramerinib", would be instances of that `type`. """ def query_types(self) -> SQLRecordList: """Query types of a record recursively. While `.type` retrieves the `type`, this method retrieves all super types of that `type`:: # Create type hierarchy type1 = model_class(name="Type1", is_type=True).save() type2 = model_class(name="Type2", is_type=True, type=type1).save() type3 = model_class(name="Type3", is_type=True, type=type2).save() # Create a record with type3 record = model_class(name=f"{model_name}3", type=type3).save() # Query super types super_types = record.query_types() assert super_types[0] == type3 assert super_types[1] == type2 assert super_types[2] == type1 """ from .has_parents import _query_ancestors_of_fk return _query_ancestors_of_fk(self, "type") # type: ignore def deferred_attribute__repr__(self): return f"FieldAttr({self.field.model.__name__}.{self.field.name})" def unique_constraint_error_in_error_message(error_msg: str) -> bool: """Check if the error message indicates a unique constraint violation.""" return ( "UNIQUE constraint failed" in error_msg # SQLite or "duplicate key value violates unique constraint" in error_msg # Postgre ) def parse_violated_field_from_error_message(error_msg: str) -> list[str] | None: # Even if the model has multiple fields with unique=True, # Django will only raise an IntegrityError for one field at a time # - whichever constraint is violated first during the database insert/update operation. if unique_constraint_error_in_error_message(error_msg): if "UNIQUE constraint failed" in error_msg: # sqlite constraint_field = ( error_msg.removeprefix("UNIQUE constraint failed: ") .split(", ")[0] .split(".")[-1] ) return [constraint_field] else: # postgres # Extract constraint name from double quotes constraint_name = error_msg.split('"')[1] # Check if it's a multi-column constraint (contains multiple field names) # Format: tablename_field1_field2_..._hash_uniq if "_uniq" in constraint_name: # Remove '_uniq' suffix first constraint_name = constraint_name.removesuffix("_uniq") # Remove hash (8 hex characters at the end) parts = constraint_name.split("_") if len(parts[-1]) == 8 and all( c in "0123456789abcdef" for c in parts[-1] ): constraint_name = "_".join(parts[:-1]) # Remove table name prefix (e.g., "bionty_ethnicity_") # Table name is typically the first 2 parts for app_model format parts = constraint_name.split("_") if len(parts) > 2: # Assume first 2 parts are table name (e.g., "bionty_ethnicity") field_string = "_".join(parts[2:]) else: field_string = constraint_name # Now parse the fields from DETAIL line # DETAIL: Key (name, ontology_id)=(South Asian, HANCESTRO:0006) already exists. 
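# A hypothetical worked example (the constraint string below is composed to match
# the format described above; it is illustrative, not from a real traceback):
#   duplicate key value violates unique constraint "bionty_ethnicity_name_ontology_id_8f3a1b2c_uniq"
#   DETAIL:  Key (name, ontology_id)=(South Asian, HANCESTRO:0006) already exists.
# The "_uniq" suffix, the 8-hex-character hash "8f3a1b2c", and the table prefix
# "bionty_ethnicity" are stripped, and the DETAIL line yields ["name", "ontology_id"].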
if "Key (" in error_msg: fields_part = error_msg.split("Key (")[1].split(")=")[0] fields = [f.strip() for f in fields_part.split(",")] return fields # Fallback if DETAIL line not available return [field_string] else: # Single field constraint (ends with _key) constraint_field = constraint_name.removesuffix("_key").split("_")[-1] return [constraint_field] return None FieldAttr.__repr__ = deferred_attribute__repr__ # type: ignore class ValidateFields: pass def is_approx_pascal_case(s: str) -> bool: """Check if the last component of a dotted string is in PascalCase. Args: s: The string to check """ if "[" in s: # this is because we allow types of form 'script[test_script.py]' return True last_component = s.split(".")[-1] return last_component[:1].isupper() and "_" not in last_component def init_self_from_db(self: SQLRecord, existing_record: SQLRecord): from .run import current_run new_args = [ getattr(existing_record, field.attname) for field in self._meta.concrete_fields ] super(self.__class__, self).__init__(*new_args) self._state.adding = False # mimic from_db self._state.db = "default" # if run was not set on the existing record, set it to the current_run if hasattr(self, "run_id") and self.run_id is None and current_run() is not None: logger.warning(f"run was not set on {self}, setting to current run") self.run = current_run() def update_attributes(record: SQLRecord, attributes: dict[str, str]): for key, value in attributes.items(): if getattr(record, key) != value and value is not None: if key not in {"uid", "_dtype_str", "otype", "hash"}: logger.warning(f"updated {key} from {getattr(record, key)} to {value}") setattr(record, key, value) else: hash_message = ( "recomputing on .save()" if key == "hash" else f"keeping {getattr(record, key)}" ) logger.debug( f"ignoring tentative value {value} for {key}, {hash_message}" ) def validate_literal_fields(record: SQLRecord, kwargs) -> None: """Validate all Literal type fields in a record. 
Args: record: record being validated Raises: ValidationError: If any field value is not in its Literal's allowed values """ if isinstance(record, IsLink): return None if record.__class__.__name__ in "Feature": return None from lamindb.base.types import ArtifactKind, Dtype, TransformKind types = { "TransformKind": TransformKind, "ArtifactKind": ArtifactKind, "Dtype": Dtype, } errors = {} annotations = getattr(record.__class__, "__annotations__", {}) for field_name, annotation in annotations.items(): if field_name not in kwargs or kwargs[field_name] is None: continue value = kwargs[field_name] if str(annotation) in types: annotation = types[annotation] if not hasattr(annotation, "__origin__"): continue literal_type = annotation if annotation.__origin__ is Literal else None if literal_type is None: continue valid_values = set(literal_type.__args__) if value not in valid_values: errors[field_name] = ( f"{field_name}: {colors.yellow(value)} is not a valid value" f"\n → Valid values are: {colors.green(', '.join(sorted(valid_values)))}" ) if errors: message = "\n " for _, error in errors.items(): message += error + "\n " raise FieldValidationError(message) def validate_fields(record: SQLRecord, kwargs): from lamindb.models import ( Artifact, Collection, Feature, Run, Schema, Transform, ULabel, ) # validate required fields # a "required field" is a Django field that has `null=False, default=None` required_fields = { k.name for k in record._meta.fields if not k.null and k.default is None } required_fields_not_passed = {k: None for k in required_fields if k not in kwargs} kwargs.update(required_fields_not_passed) missing_fields = [ k for k, v in kwargs.items() if v is None and k in required_fields ] if missing_fields: raise FieldValidationError(f"{missing_fields} are required.") # ensure the exact length of the internal uid for core entities if "uid" in kwargs and record.__class__ in { Artifact, Collection, Transform, Run, ULabel, Feature, Schema, }: uid_max_length = record.__class__._meta.get_field( "uid" ).max_length # triggers FieldDoesNotExist if len(kwargs["uid"]) != uid_max_length: # triggers KeyError if not ( record.__class__ is Schema and len(kwargs["uid"]) == 16 ): # no error for schema raise ValidationError( f"`uid` must be exactly {uid_max_length} characters long, got {len(kwargs['uid'])}." ) # validate is_type if "is_type" in kwargs and "name" in kwargs and kwargs["is_type"]: is_approx_pascal_case(kwargs["name"]) if ( "type" in kwargs and isinstance(kwargs["type"], HasType) and not kwargs["type"].is_type ): object_name = record.__class__.__name__.lower() raise ValueError( f"You can only assign a {object_name} with `is_type=True` as `type` to another {object_name}, but this doesn't have it: {kwargs['type']}" ) # validate literals validate_literal_fields(record, kwargs) def suggest_records_with_similar_names( record: SQLRecord, name_field: str, kwargs ) -> SQLRecord | None: """Returns a record if found exact match, otherwise None. Logs similar matches if found. """ if kwargs.get(name_field) is None or not isinstance(kwargs.get(name_field), str): return None # need to perform an additional request to find the exact match # previously, this was inferred from the truncated/fuzzy search below # but this isn't reliable: https://laminlabs.slack.com/archives/C04FPE8V01W/p1737812808563409 # the below needs to be .first() because there might be multiple records with the same # name field in case the record is versioned (e.g. 
for Transform key) if isinstance(record, HasType): if kwargs.get("type", None) is None: subset = record.__class__.filter(type__isnull=True) else: subset = record.__class__.filter(type=kwargs["type"]) else: subset = record.__class__ exact_match = subset.filter(**{name_field: kwargs[name_field]}).first() if exact_match is not None: return exact_match queryset = _search( subset, kwargs[name_field], field=name_field, truncate_string=True, limit=3, ) if not queryset.exists(): # empty queryset return None s, it, nots, record_text = ( ("", "it", "s", "a record") if len(queryset) == 1 else ("s", "one of them", "", "records") ) similar_names = ", ".join(f"'{getattr(record, name_field)}'" for record in queryset) msg = f"you are trying to create a record with name='{kwargs[name_field]}' but {record_text} with similar {name_field}{s} exist{nots}: {similar_names}. Did you mean to load {it}?" logger.warning(f"{msg}") return None def delete_record(record: BaseSQLRecord, is_soft: bool = True): def delete(): if is_soft: record.branch_id = -1 record.save() return None else: return super(BaseSQLRecord, record).delete() # deal with versioned records # if _overwrite_versions = True, there is only a single version and # no need to set the new latest version because all versions are deleted # when deleting the latest version if ( isinstance(record, IsVersioned) and record.is_latest and not getattr(record, "_overwrite_versions", False) ): promoted = _adjust_is_latest_when_deleting_is_versioned(record) if promoted: if is_soft: record.is_latest = False with transaction.atomic(): result = delete() return result # deal with all other cases of the nested if condition now return delete() RECORD_REGISTRY_EXAMPLE = """Example:: from lamindb import SQLRecord, fields # sub-classing `SQLRecord` creates a new registry class Experiment(SQLRecord): name: str = fields.CharField() # instantiating `Experiment` creates a record `experiment` experiment = Experiment(name="my experiment") # you can save the record to the database experiment.save() # `Experiment` refers to the registry, which you can query df = Experiment.filter(name__startswith="my ").to_dataframe() """ def _synchronize_clone(storage_root: str) -> str | None: """Synchronizes a clone to the local SQLite path. Args: storage_root: The storage root path of the (target) instance """ cloud_db_path = UPath(storage_root) / ".lamindb" / "lamin.db" local_sqlite_path = ln_setup.settings.cache_dir / cloud_db_path.path.lstrip("/") local_sqlite_path.parent.mkdir(parents=True, exist_ok=True) cloud_db_path_gz = UPath(str(cloud_db_path) + ".gz", anon=True) local_sqlite_path_gz = Path(str(local_sqlite_path) + ".gz") try: if cloud_db_path_gz.synchronize_to( local_sqlite_path_gz, error_no_origin=True, print_progress=True ): with ( gzip.open(local_sqlite_path_gz, "rb") as f_in, open(local_sqlite_path, "wb") as f_out, ): shutil.copyfileobj(f_in, f_out) return f"sqlite:///{local_sqlite_path}" except (FileNotFoundError, PermissionError): logger.debug("Clone not found. Falling back to normal access...") return None # this is the metaclass for SQLRecord @doc_args(RECORD_REGISTRY_EXAMPLE) class Registry(ModelBase): """Metaclass for :class:`~lamindb.models.SQLRecord`. Each `Registry` *object* is a `SQLRecord` *class* and corresponds to a table in the metadata SQL database. You work with `Registry` objects whenever you use *class methods* of `SQLRecord`. You call any subclass of `SQLRecord` a "registry" and their objects "records". A `SQLRecord` object corresponds to a row in the SQL table. 
If you want to create a new registry, you sub-class `SQLRecord`. {} Note: `Registry` inherits from Django's `ModelBase`. """ _available_fields: set[str] = None def __new__(cls, name, bases, attrs, **kwargs): new_class = super().__new__(cls, name, bases, attrs, **kwargs) return new_class # below creates a sensible auto-complete behavior that differs across the # class and instance level in Jupyter Editors it doesn't have any effect for # static type analyzer like pylance used in VSCode def __dir__(cls): # this is needed to bring auto-complete on the class-level back # https://laminlabs.slack.com/archives/C04FPE8V01W/p1717535625268849 # Filter class attributes, excluding instance methods exclude_instance_methods = "sphinx" not in sys.modules # https://laminlabs.slack.com/archives/C04FPE8V01W/p1721134595920959 def include_attribute(attr_name, attr_value): if attr_name.startswith("__"): return False if exclude_instance_methods and callable(attr_value): return isinstance(attr_value, (classmethod, staticmethod, type)) return True # check also inherited attributes if hasattr(cls, "mro"): attrs = chain(*(c.__dict__.items() for c in cls.mro())) else: attrs = cls.__dict__.items() result = [] for attr_name, attr_value in attrs: if attr_name not in result and include_attribute(attr_name, attr_value): result.append(attr_name) # Add non-dunder attributes from Registry for attr in dir(Registry): if not attr.startswith("__") and attr not in result: result.append(attr) return result def describe(cls, return_str: bool = False) -> str | None: """Describe the fields of the registry.""" from ._describe import strip_ansi_from_string as _strip_ansi repr_str = f"{colors.green(cls.__name__)}\n" info = SQLRecordInfo(cls) repr_str += info.get_simple_fields(return_str=True) repr_str += info.get_relational_fields(return_str=True) repr_str = repr_str.rstrip("\n") if return_str: return _strip_ansi(repr_str) else: print(repr_str) return None @doc_args(_lookup.__doc__) def lookup( cls, field: StrField | None = None, return_field: StrField | None = None, keep: Literal["first", "last", False] = "first", ) -> NamedTuple: """{}""" # noqa: D415 return _lookup(cls=cls, field=field, return_field=return_field, keep=keep) def filter(cls, *queries, **expressions) -> QuerySet: """Query records. Args: queries: One or multiple `Q` objects. expressions: Fields and values passed as Django query expressions. See Also: - Guide: :doc:`docs:registries` - Django documentation: `Queries `__ Examples: >>> ln.Project(name="my label").save() >>> ln.Project.filter(name__startswith="my").to_dataframe() """ from .query_set import QuerySet _using_key = None if "_using_key" in expressions: _using_key = expressions.pop("_using_key") return QuerySet(model=cls, using=_using_key).filter(*queries, **expressions) def get( cls: type[T], idlike: int | str | None = None, **expressions, ) -> T: """Get a single record. Args: idlike: Either a uid stub, uid or an integer id. expressions: Fields and values passed as Django query expressions. Raises: :exc:`lamindb.errors.ObjectDoesNotExist`: In case no matching record is found. 
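        For example, a minimal sketch of handling a missing record (the registry
        and name below are illustrative)::

            from lamindb.errors import ObjectDoesNotExist

            try:
                record = ln.Record.get(name="nonexistent-label")
            except ObjectDoesNotExist:
                record = None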
See Also: - Guide: :doc:`registries` - Django documentation: `Queries `__ Examples: :: record = ln.Record.get("FvtpPJLJ") record = ln.Record.get(name="my-label") """ from .query_set import QuerySet return QuerySet(model=cls).get(idlike, **expressions) def to_dataframe( cls, *, include: str | list[str] | None = None, features: str | list[str] | None = None, limit: int | None = 100, order_by: str | None = "-id", ) -> pd.DataFrame: """Evaluate and convert to `pd.DataFrame`. By default, this returns up to 100 rows for a fast overview. Pass `limit=None` to fetch all matching records. By default, maps simple fields and foreign keys onto `DataFrame` columns. Guide: :doc:`docs:registries` Args: include: Related data to include as columns. Takes strings of form `"records__name"`, `"cell_types__name"`, etc. or a list of such strings. For `Artifact`, `Record`, and `Run`, can also pass `"features"` to include features with data types pointing to entities in the core schema. If `"privates"`, includes private fields (fields starting with `_`). features: Configure the features to include. Can be a feature name or a list of such names. If `"queryset"`, infers the features used within the current queryset. Only available for `Artifact`, `Record`, and `Run`. limit: Maximum number of rows to display. Defaults to 100. If `None`, includes all results. order_by: Field name to order the records by. Prefix with '-' for descending order. Defaults to '-id' to get the most recent records. This argument is ignored if the queryset is already ordered or if the specified field does not exist. Examples: Include the name of the creator:: ln.Record.to_dataframe(include="created_by__name") Include features:: ln.Artifact.to_dataframe(include="features") Include selected features:: ln.Artifact.to_dataframe(features=["cell_type_by_expert", "cell_type_by_model"]) """ return cls.filter().to_dataframe( include=include, features=features, order_by=order_by, limit=limit ) @deprecated(new_name="to_dataframe") def df( cls, *, include: str | list[str] | None = None, features: str | list[str] | None = None, limit: int | None = 100, order_by: str | None = "-id", ) -> pd.DataFrame: return cls.to_dataframe( include=include, features=features, limit=limit, order_by=order_by ) @doc_args(_search.__doc__) def search( cls, string: str, *, field: StrField | None = None, limit: int | None = 20, case_sensitive: bool = False, ) -> QuerySet: """{}""" # noqa: D415 return _search( cls=cls, string=string, field=field, limit=limit, case_sensitive=case_sensitive, ) @deprecated(new_name="connect") def using( cls, instance: str | None, ) -> QuerySet: return cls.connect( instance=instance, ) def connect( cls, instance: str | None, ) -> QuerySet: """Query a non-default LaminDB instance. Args: instance: An instance identifier of form "account_handle/instance_name".
Examples: :: ln.Record.connect("account_handle/instance_name").search("label7", field="name") """ from .query_set import QuerySet # we're in the default instance if instance is None or instance == "default": return QuerySet(model=cls, using=None) # connection already established if instance in connections: return QuerySet(model=cls, using=instance) owner, name = get_owner_name_from_identifier(instance) current_instance_owner_name: list[str] = setup_settings.instance.slug.split("/") # move on to different instances cache_using_filepath = ( setup_settings.cache_dir / f"instance--{owner}--{name}--uid.txt" ) settings_file = instance_settings_file(name, owner) if not settings_file.exists(): result = connect_instance_hub(owner=owner, name=name) if isinstance(result, str): message = INSTANCE_NOT_FOUND_MESSAGE.format( owner=owner, name=name, hub_result=result ) raise InstanceNotFoundError(message) iresult, storage = result # this can happen if querying via an old instance name if [iresult.get("owner"), iresult["name"]] == current_instance_owner_name: return QuerySet(model=cls, using=None) # do not use {} syntax below, it gives rise to a dict if the schema modules # are empty and then triggers a TypeError in missing_members = source_modules - target_modules source_modules = set( # noqa [mod for mod in iresult["schema_str"].split(",") if mod != ""] ) # Try to connect to a clone if targeting a public instance but fall back to normal access if access failed db = None if ( "_public" in iresult["db_user_name"] and "postgresql" in iresult["db_scheme"] ): db = _synchronize_clone(storage["root"]) if db is None: if [ iresult.get("owner"), iresult["name"], ] == current_instance_owner_name: return QuerySet(model=cls, using=None) db = update_db_using_local(iresult, settings_file) is_fine_grained_access = ( iresult["fine_grained_access"] and iresult["db_permissions"] == "jwt" ) else: is_fine_grained_access = False cache_using_filepath.write_text( f"{iresult['lnid']}\n{iresult['schema_str']}", encoding="utf-8" ) # access_db can take both: the dict from connect_instance_hub and isettings into_db_token = iresult else: isettings = load_instance_settings(settings_file) source_modules = isettings.modules db = None if "public" in isettings.db and isettings.dialect == "postgresql": db = _synchronize_clone(isettings.storage.root_as_str) # Try to connect to a clone if targeting a public instance but fall back to normal access if access failed if db is None: if [isettings.owner, isettings.name] == current_instance_owner_name: return QuerySet(model=cls, using=None) db = isettings.db is_fine_grained_access = ( isettings._fine_grained_access and isettings._db_permissions == "jwt" ) else: is_fine_grained_access = False cache_using_filepath.write_text( f"{isettings.uid}\n{','.join(source_modules)}", encoding="utf-8" ) # access_db can take both: the dict from connect_instance_hub and isettings into_db_token = isettings target_modules = setup_settings.instance.modules if missing_members := source_modules - target_modules: logger.info( f"in transfer, source lamindb instance has additional modules: {', '.join(missing_members)}" ) add_db_connection(db, instance) if is_fine_grained_access: db_token = DBToken(into_db_token) db_token_manager.set(db_token, instance) return QuerySet(model=cls, using=instance) def __get_module_name__(cls) -> str: schema_module_name = cls.__module__.split(".")[0] module_name = schema_module_name.replace("lnschema_", "") if module_name == "lamindb": module_name = "core" return module_name def 
__get_name_with_module__(cls) -> str: module_name = cls.__get_module_name__() if module_name == "core": module_prefix = "" else: module_prefix = f"{module_name}." return f"{module_prefix}{cls.__name__}" def __get_available_fields__(cls) -> set[str]: if cls._available_fields is None: available_fields = set() for field in cls._meta.get_fields(): if not (field_name := field.name).startswith(("_", "links_")): available_fields.add(field_name) if isinstance(field, django_ForeignKey): available_fields.add(field_name + "_id") if cls.__name__ == "Artifact": available_fields.add("transform") available_fields.add("feature_sets") # backward compat with lamindb v1 cls._available_fields = available_fields return cls._available_fields class BaseSQLRecord(models.Model, metaclass=Registry): """Base SQL metadata record. It provides methods to `SQLRecord` and all its subclasses, but doesn't come with the additional `branch` and `space` fields. """ objects = QueryManager() class Meta: abstract = True base_manager_name = "objects" # fields to track for changes # if not None, will be tracked in self._original_values as {field_name: value} # use _id fields for foreign keys _TRACK_FIELDS: tuple[str, ...] | None = None def __init__(self, *args, **kwargs): skip_validation = kwargs.pop("_skip_validation", False) if not args: if not os.getenv("LAMINDB_MULTI_INSTANCE") == "true": if ( issubclass(self.__class__, SQLRecord) and self.__class__.__name__ != "Storage" # do not save bionty entities in restricted spaces by default and self.__class__.__module__ != "bionty.models" ): from lamindb import context as run_context if run_context.space is not None: current_space = run_context.space elif setup_settings.space is not None: current_space = setup_settings.space if current_space is not None: if "space_id" in kwargs: # space_id takes precedence over space # https://claude.ai/share/f045e5dc-0143-4bc5-b8a4-38309229f75e if kwargs["space_id"] == 1: # ignore default space kwargs.pop("space_id") kwargs["space"] = current_space elif "space" in kwargs: if kwargs["space"] is None: kwargs["space"] = current_space else: kwargs["space"] = current_space if _is_branch_sensitive_model(self.__class__): from lamindb import context as run_context if run_context.branch is not None: current_branch = run_context.branch elif setup_settings.branch is not None: current_branch = setup_settings.branch if current_branch is not None: # branch_id takes precedence over branch # https://claude.ai/share/f045e5dc-0143-4bc5-b8a4-38309229f75e if "branch_id" in kwargs: if kwargs["branch_id"] == 1: # ignore default branch kwargs.pop("branch_id") kwargs["branch"] = current_branch elif "branch" in kwargs: if kwargs["branch"] is None: kwargs["branch"] = current_branch else: kwargs["branch"] = current_branch kwargs["created_on"] = kwargs["branch"] if skip_validation: super().__init__(**kwargs) else: from ..core._settings import settings from .can_curate import CanCurate from .collection import Collection from .transform import Transform validate_fields(self, kwargs) # do not search for names if an id is passed; this is important # e.g. 
when synching ids from the notebook store to lamindb has_consciously_provided_uid = False if "_has_consciously_provided_uid" in kwargs: has_consciously_provided_uid = kwargs.pop( "_has_consciously_provided_uid" ) if ( isinstance(self, (CanCurate, Collection, Transform)) and settings.creation.search_names and not has_consciously_provided_uid ): name_field = getattr(self, "_name_field", "name") exact_match = suggest_records_with_similar_names( self, name_field, kwargs ) if exact_match is not None: if "version_tag" in kwargs: if kwargs.get("version_tag") is not None: version_comment = " and version" existing_record = self.__class__.filter( **{ name_field: kwargs[name_field], "version_tag": kwargs.get("version_tag"), } ).one_or_none() else: # for a versioned record, an exact name match is not a criterion # for retrieving a record in case `version` isn't passed - # we'd always pull out many records with exactly the same name existing_record = None else: version_comment = "" existing_record = exact_match if existing_record is not None: logger.important( f"returning {self.__class__.__name__.lower()} with same" f" {name_field}{version_comment}: '{kwargs[name_field]}'" ) init_self_from_db(self, existing_record) update_attributes(self, kwargs) # track original values after replacing with the existing record self._populate_tracked_fields() return None super().__init__(**kwargs) if isinstance(self, ValidateFields): # this will trigger validation against django validators try: if hasattr(self, "clean_fields"): self.clean_fields() else: self._Model__clean_fields() except DjangoValidationError as e: message = _format_django_validation_error(self, e) raise FieldValidationError(message) from e elif len(args) != len(self._meta.concrete_fields): raise FieldValidationError( f"Use keyword arguments instead of positional arguments, e.g.: {self.__class__.__name__}(name='...')." ) else: super().__init__(*args) # track original values of fields that are tracked for changes self._populate_tracked_fields() # TODO: refactor to use _TRACK_FIELDS track_current_name_value(self) # used in __init__ # populates the _original_values dictionary with the original values of the tracked fields def _populate_tracked_fields(self): if (track_fields := self._TRACK_FIELDS) is not None: concrete_attnames = {f.attname for f in self._meta.concrete_fields} self._original_values = {} for field_name in track_fields: if field_name not in concrete_attnames: raise FieldValidationError( f"_TRACK_FIELDS contains invalid field for {self.__class__.__name__}: {field_name}" ) # deferred model loading (e.g. .only("id") or certain fetching methods during deletion) # can omit tracked fields from __dict__; # use .get(..., DEFERRED) to avoid KeyError and to show that the field is not loaded yet. 
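# A hypothetical sketch of how the snapshot taken below is consumed later
# (the subclass and field are illustrative, not an actual registry configuration):
#
#     class MyRecord(SQLRecord):
#         _TRACK_FIELDS = ("name",)
#
#     record = MyRecord.get(name="old")
#     record.name = "new"
#     record._field_changed("name")   # True: current value differs from the snapshot
#
# Unsaved records and deferred (not-yet-loaded) fields report False.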
self._original_values[field_name] = self.__dict__.get( field_name, DEFERRED ) else: self._original_values = None def _field_changed(self, field_name: str, check_is_saved: bool = True) -> bool: """Check if the field has changed since the record was saved.""" # use _id fields for foreign keys in field_name if check_is_saved and self._state.adding: return False # check if the field is tracked for changes track_fields = self._TRACK_FIELDS assert track_fields is not None, ( "_TRACK_FIELDS must be set for the record to track changes" ) assert field_name in track_fields, ( f"Field {field_name} is not tracked for changes" ) # check if the field has changed since the record was created original_value = self._original_values.get(field_name, DEFERRED) if original_value is DEFERRED: return False current_value = self.__dict__.get(field_name, DEFERRED) if current_value is DEFERRED: return False return original_value != current_value def save(self: T, *args, **kwargs) -> T: """Save. Always saves to the default database. """ using_key = None if "using" in kwargs: using_key = kwargs["using"] transfer_config = kwargs.pop("transfer", None) db = self._state.db pk_on_db = self.pk artifacts: list = [] if self.__class__.__name__ == "Collection" and self.id is not None: # when creating a new collection without being able to access artifacts artifacts = self.ordered_artifacts.to_list() pre_existing_record = None # consider records that are being transferred from other databases transfer_logs: dict[str, list[str]] = { "mapped": [], "transferred": [], "run": None, } if db is not None and db != "default" and using_key is None: if isinstance(self, IsVersioned): if not self.is_latest: raise NotImplementedError( "You are attempting to transfer a record that's not the latest in its version history. This is currently not supported." ) pre_existing_record = transfer_to_default_db( self, using_key, transfer_logs=transfer_logs ) self._revises: IsVersioned if pre_existing_record is not None: init_self_from_db(self, pre_existing_record) else: # TODO: refactor to use _TRACK_FIELDS check_name_change(self) try: # save versioned record in presence of self._revises if isinstance(self, IsVersioned) and self._revises is not None: revises = self._revises with transaction.atomic(): # For branch-aware models (SQLRecord), keep source-branch latest # intact and only demote within the same branch. For other # versioned models (e.g. blocks), keep previous behavior. 
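# A hypothetical illustration of the rule applied below: if v1 (is_latest=True on
# branch "main") is revised by v2 saved on the same branch, v1 is demoted and v2
# becomes the latest version; if v2 is instead created on a contribution branch,
# v1 keeps is_latest=True on "main" and the two are reconciled on merge
# (see the Branch registry documented further down in this module).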
should_demote = True if hasattr(revises, "branch_id") and hasattr(self, "branch_id"): should_demote = revises.branch_id == self.branch_id if should_demote: assert revises.is_latest # noqa: S101 revises.is_latest = False revises._revises = None # ensure we don't start a recursion revises.save() super().save(*args, **kwargs) # type: ignore self._revises = None # save unversioned record else: super().save(*args, **kwargs) except (IntegrityError, ProgrammingError) as e: error_msg = str(e) # error for hash/uid duplication if ( self.__class__.__name__ in {"Transform", "Artifact", "Collection"} and isinstance(e, IntegrityError) and "hash" in error_msg and unique_constraint_error_in_error_message(error_msg) ): # we also need to include the key here because hash can be the same across keys query_fields = {"hash": self.hash, "key": self.key} if self.__class__.__name__ == "Artifact": # in case of artifact, also storage is needed query_fields["storage"] = self.storage # the get here is Django's get and not aware of the trash or other branches # but generally we bypass branch_id in queries for hash also in LaminDB's get() pre_existing_record = self.__class__.get(**query_fields) from_trash = ( "from trash" if pre_existing_record.branch_id == -1 else "" ) pre_existing_record.branch_id = 1 # move to default branch logger.warning( f"returning {self.__class__.__name__.lower()} {from_trash} with same hash & key: {pre_existing_record}" ) init_self_from_db(self, pre_existing_record) elif ( isinstance(e, IntegrityError) # for Storage, even if uid was in the error message, we can retrieve based on # the root because it's going to be the same root and any(field in error_msg for field in UNIQUE_FIELD_NAMES) and ( "_type_name_at_" not in error_msg ) # constraints for unique type names in Record, ULabel, etc. and ( "UNIQUE constraint failed" in error_msg or "duplicate key value violates unique constraint" in error_msg ) and hasattr(self, "branch_id") ): unique_fields = parse_violated_field_from_error_message(error_msg) # here we query against the all branches with .objects pre_existing_record = self.__class__.objects.get( **{field: getattr(self, field) for field in unique_fields} ) # if the existing record is in the default branch, we just return it if pre_existing_record.branch_id == 1: logger.warning( f"returning {self.__class__.__name__} record with same {unique_fields}: '{ {field: getattr(self, field) for field in unique_fields} }'" ) # if the existing record is in a different branch we update its fields else: # modifies the fields of the existing record with new values of self field_names = [i.name for i in self.__class__._meta.fields] update_attributes( pre_existing_record, {f: getattr(self, f) for f in field_names}, ) pre_existing_record.save() init_self_from_db(self, pre_existing_record) elif ( isinstance(e, ProgrammingError) and "new row violates row-level security policy" in error_msg and ( (is_locked := getattr(self, "is_locked", False)) or hasattr(self, "space") ) ): if is_locked: no_write_msg = "It is not allowed to modify or create locked ('is_locked=True') records." else: no_write_msg = ( f"You're not allowed to write to the space '{self.space.name}'.\n" "Please contact administrators of the space if you need write access." 
) raise NoWriteAccess(no_write_msg) from None elif ( isinstance(e, ProgrammingError) and "permission denied for table" in error_msg and (isettings := setup_settings.instance)._db_permissions == "public" ): slug = isettings.slug raise NoWriteAccess( f"You are trying to write to '{slug}' with public (read-only) permissions.\n" "Please contact administrators to make you a collaborator if you need write access.\n" f"If you are already a collaborator, please do 'lamin connect {slug}' in console, " "restart the python session and try again." ) from None else: raise # call the below in case a user makes more updates to the record track_current_name_value(self) # perform transfer of many-to-many fields # only supported for Artifact and Collection records if db is not None and db != "default" and using_key is None: if self.__class__.__name__ == "Collection": if len(artifacts) > 0: logger.info("transfer artifacts") for artifact in artifacts: artifact.save() self.artifacts.add(*artifacts) if hasattr(self, "labels") and transfer_config == "annotations": from copy import copy # here we go back to original record on the source database self_on_db = copy(self) self_on_db._state.db = db self_on_db.pk = pk_on_db # manually set the primary key self.features._add_from(self_on_db, transfer_logs=transfer_logs) self.labels.add_from(self_on_db, transfer_logs=transfer_logs) for k, v in transfer_logs.items(): if k != "run" and len(v) > 0: logger.important(f"{k}: {', '.join(v)}") if self.__class__.__name__ in { "Artifact", "Transform", "Run", "ULabel", "Feature", "Schema", "Collection", "Reference", } and not ( self.__class__.__name__ == "Artifact" and self.kind == "__lamindb_run__" ): import lamindb as ln if ln.context.project is not None: self.projects.add(ln.context.project) return self @class_and_instance_method def describe( cls_or_self, return_str: bool = False, include: None | Literal["comments"] = None, ) -> None | str: """Describe record including relations. Args: return_str: Return a string instead of printing. include: Include additional content. Use ``"comments"`` to display readme and comment blocks. 
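        Example (a minimal sketch; `artifact` stands for any saved record,
        e.g. an `ln.Artifact`)::

            artifact.describe()                         # print a rich description
            text = artifact.describe(return_str=True)   # capture it as a string
            artifact.describe(include="comments")       # also show readme & comment blocks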
""" from ._describe import describe_postgres_sqlite if isinstance(cls_or_self, type): return type(cls_or_self).describe(cls_or_self, return_str=return_str) # type: ignore else: return describe_postgres_sqlite( cls_or_self, return_str=return_str, include=include ) def __repr__( self: SQLRecord, include_foreign_keys: bool = True, exclude_field_names: list[str] | None = None, ) -> str: if exclude_field_names is None: exclude_field_names = ["id", "updated_at", "source_code"] field_names = [ field.name for field in self._meta.fields if ( not isinstance(field, ForeignKey) and field.name not in exclude_field_names ) ] if include_foreign_keys: field_names += [ f"{field.name}_id" for field in self._meta.fields if isinstance(field, ForeignKey) ] # TODO: harmonize with L426 in query_set.py if "created_at" in field_names: field_names.remove("created_at") field_names.append("created_at") if "is_locked" in field_names: field_names.remove("is_locked") field_names.append("is_locked") if "created_on" in field_names: field_names.remove("created_on") field_names.append("created_on") if "version_tag" in field_names: field_names.remove("version_tag") field_names.append("version_tag") if "is_latest" in field_names: field_names.remove("is_latest") field_names.append("is_latest") if field_names[0] != "uid" and "uid" in field_names: field_names.remove("uid") field_names.insert(0, "uid") fields_str = {} for k in field_names: if k == "n" and getattr(self, k) < 0: # only needed for Schema continue if ( not k.startswith("_") or (k == "_dtype_str" and self.__class__.__name__ == "Feature") ) and hasattr(self, k): value = getattr(self, k) # Force strip the time component of the version if k == "version" and value: fields_str[k] = f"'{str(value).split()[0]}'" else: fields_str[k] = format_field_value(value) fields_joined_str = ", ".join( [f"{k}={fields_str[k]}" for k in fields_str if fields_str[k] is not None] ) return f"{self.__class__.__name__}({fields_joined_str})" def __str__(self) -> str: return self.__repr__() def delete(self, permanent: bool | None = None): """Delete. Args: permanent: For consistency, `False` raises an error, as soft delete is impossible. Returns: When `permanent=True`, returns Django's delete return value: a tuple of (deleted_count, {registry_name: count}). Otherwise returns None. """ if permanent is False: raise ValueError( f"Soft delete is not possible for {self.__class__.__name__}, " "use 'permanent=True' or 'permanent=None' for permanent deletion." ) return delete_record(self, is_soft=False) class Space(BaseSQLRecord): """Spaces with managed access for specific users or teams. If not setting a space, a :class:`~lamindb.models.SQLRecord` object is accessible to all collaborators of the LaminDB instance because its :attr:`~lamindb.models.SQLRecord.space` field defaults to the built-in `all` space. You can create a restricted space through LaminHub either on the instance settings page or the *Spaces* tab of your account page. Examples: After creating a restricted space through LaminHub, create an artifact in the space:: space = ln.Space.get(name="Our space") # get a space ln.Artifact("./test.txt", key="test.txt", space=space).save() # save artifact in space You can also move an existing object into a space:: space = ln.Space.get(name="Our space") # select a space record = ln.Record.get(name="existing label") record.space = space record.save() # saved in space "Our space" For more examples and background, see :doc:`docs:permissions`, in particular, section :ref:`docs:use-a-restricted-space`. 
Notes: All data in this registry is synchronized from LaminHub so that spaces can be shared and reused across multiple LaminDB instances. """ class Meta: app_label = "lamindb" constraints = [ models.UniqueConstraint(Lower("name"), name="unique_space_name_lower") ] id: int = models.SmallAutoField(primary_key=True) """Internal id, valid only in one DB instance.""" name: str = models.CharField(max_length=100, db_index=True) """Name of space.""" uid: str = CharField( editable=False, unique=True, max_length=12, default=base62_12, db_index=True, ) """Universal id.""" description: str | None = TextField(null=True) """Description of space.""" created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" created_by: User = ForeignKey( "User", CASCADE, default=None, related_name="+", null=True ) """Creator of space.""" ablocks: RelatedManager[SpaceBlock] """Attached blocks ← :attr:`~lamindb.SpaceBlock.space`.""" @overload def __init__( self, name: str, description: str | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if not args and "uid" not in kwargs: warn = False msg = "" isettings = setup_settings.instance if (dialect := isettings.dialect) != "postgresql": warn = True msg = f"on {dialect} databases" elif not isettings.is_on_hub: warn = True msg = "on local instances" if warn: logger.warning( f"creating spaces manually {msg} is possible for demo purposes, " "but does *not* affect access permissions" ) super().__init__(*args, **kwargs) class Branch(BaseSQLRecord): """Branches for change management with archive and trash states. .. dropdown:: The 3 built-in branches: `main`, `trash` & `archive` The `main` branch acts as the default branch. The `trash` branch acts like a trash bin on a file system. If you delete a `SQLRecord` object via `.delete()`, it gets moved onto the `trash` branch and scheduled for deletion. The `archive` branch hides objects from queries and searches without scheduling them for deletion. To move an object into the archive, run: `obj.branch_id = 0; obj.save()`. Args: name: A unique name. When lower-cased, is constrained to be unique across all branches. description: A description. Examples: To create a contribution branch and switch to it, run:: lamin switch -c my_branch To merge a contribution branch into `main`, run:: lamin switch main # switch to the main branch lamin merge my_branch # merge contribution branch into main To see the current branch along with other information, run:: lamin info To annotate the current branch with a `README.md`, run:: lamin annotate branch --readme README.md To comment on the current branch, run:: lamin annotate branch --comment "I think we should revisit this, tomorrow, WDYT?" To describe the current branch (optionally include comments), run:: lamin describe branch --include comments To trace on which branch a `SQLRecord` object was created, run:: sqlrecord.created_on.describe() To open a Change Request for a branch, run: .. tab-set:: .. tab-item:: CLI .. code-block:: bash lamin update branch --status draft # for current branch lamin update branch --name my_branch --status review # for any branch .. tab-item:: Python .. code-block:: python branch = ln.Branch.get(name="my_branch") branch.status = "draft" branch.save() branch.status = "review" branch.save() Just like Pull Requests on GitHub, branches are never deleted so that the provenance of a change stays traceable. ..
dropdown:: Managing `is_latest` during branching `is_latest` is branch-aware during development and reconciled on merge. - Creating a new version on a contribution branch keeps the previous version on `main` as `is_latest=True`. - After `lamin merge`, only one object per version family remains with `is_latest=True` in the target branch. - If both source and target branches have `is_latest=True`, the merged branch keeps the newest object by `created_at`. Example flow:: # before merge # main: v1.is_latest=True # contribution branch: v2(revises=v1).is_latest=True lamin switch main lamin merge my_branch # after merge on main: v2.is_latest=True, v1.is_latest=False .. dropdown:: Logical vs. physical branching LaminDB uses **logical branching** via `SQLRecord`'s `.branch` field, treating `branch` like any other field during queries & tracing, and keeping infrastructure simple and platform-agnostic. However, it doesn't allow isolating SQL `UPDATE` statements on a branch (only their corresponding `DbWrite` events). Here are some notable alternatives: - Some Postgres platforms like Supabase or Neon, by contrast, provide physical branching through cloning entire databases. This allows for isolated SQL `UPDATE` statements but creates separate, disconnected environments and much overhead. - Project Nessie is a versioned catalog for data lakes that tracks file states. LaminDB is analogous to Nessie in that it also treats branching on the metadata catalog level (considering LaminDB's SQL database as the metadata catalog). - Dolt is a specialized database engine that provides storage-level branching. It allows branch isolation and merging at the engine level. While powerful, it requires using the Dolt database itself. Why logical branching? Data science and ML workflows are primarily append-only. Because a "change" usually results in a new version of an artifact, transform, or collection or new runs or other new objects rather than an in-place modification, the row-level `branch` field provides isolation for 99% of use cases. This avoids the technical complexity of row duplication, preserves database integrity, and allows the `is_latest` logic to reconcile versions globally upon merge. """ class Meta: app_label = "lamindb" constraints = [ models.UniqueConstraint(Lower("name"), name="unique_branch_name_lower") ] # below isn't fully implemented but a roadmap # - 3: template (hidden in queries & searches) # - 2: locked (same as default, but locked for edits except for space admins) # - 1: default (visible in queries & searches) # - 0: archive (hidden, meant to be kept, locked for edits for everyone) # - -1: trash (hidden, scheduled for deletion) # An integer higher than >3 codes a branch that can be used for collaborators to create drafts # that can be merged onto the main branch in an experience akin to a Pull Request. The mapping # onto a semantic branch name is handled through LaminHub. id: int = models.AutoField(primary_key=True) """An integer id that's synchronized for a family of coupled database instances. Among all LaminDB instances, this id is arbitrary and non-unique. """ name: str = models.CharField(max_length=100, db_index=True) """Name of branch.""" uid: str = CharField( editable=False, unique=True, max_length=12, default=base62_12, db_index=True, ) """Universal id. This id is useful if one wants to apply the same patch to many database instances. 
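    For example, the same branch can be looked up in every instance via its uid (a sketch; the uid value is made up)::

        branch = ln.Branch.get(uid="aBcDeFgH1234")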
""" space: Space = ForeignKey(Space, PROTECT, default=1, db_default=1, related_name="+") """The space associated with the branch.""" description: str | None = TextField(null=True) """Description of branch.""" created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" created_by: User = ForeignKey( "User", PROTECT, default=current_user_id, related_name="+" ) """Creator of branch.""" _status_code: int = models.SmallIntegerField(default=0, db_default=0, db_index=True) """Status code. -2: closed; -1: merged; 0: standalone; 1: draft; 2: review.""" _aux: dict[str, Any] | None = JSONField(default=None, db_default=None, null=True) """Auxiliary field for dictionary-like metadata.""" ablocks: RelatedManager[BranchBlock] """Attached blocks ← :attr:`~lamindb.BranchBlock.branch`.""" users: RelatedManager[User] = models.ManyToManyField( "User", through="BranchUser", related_name="branches", ) """Users linked to this branch (e.g. reviewers) ← :attr:`~lamindb.User.branches`.""" ulabels: RelatedManager[ULabel] = models.ManyToManyField( "ULabel", through="BranchULabel", related_name="branches", ) """ULabels annotating this branch ← :attr:`~lamindb.BranchULabel.ulabel`.""" projects: RelatedManager[Project] = models.ManyToManyField( "Project", through="BranchProject", related_name="branches", ) """Projects annotating this branch ← :attr:`~lamindb.BranchProject.project`.""" @property def status(self) -> BranchStatus: """Branch status. Get and set the status of the branch. ============= ===== ================================================== status code description ============= ===== ================================================== `closed` -2 Change Request was closed without merging. `merged` -1 The branch was merged into another branch. `standalone` 0 A standalone branch without Change Request. `draft` 1 Change Request exists but is not ready for review. `review` 2 Change Request is ready for review. ============= ===== ================================================== The database stores the branch status as an integer code in field `_status_code`. Example: See the status of a branch:: branch.status #> 'standalone' Open a Change Request in draft state:: branch.status = "draft" branch.save() Request review for the Change Request:: branch.status = "review" branch.save() Query by status:: ln.Branch.filter(status="merged").to_dataframe() """ return BRANCH_CODE_TO_STATUS.get(self._status_code, "standalone") @status.setter def status(self, value: BranchStatus) -> None: if value not in BRANCH_STATUS_TO_CODE: raise ValueError( "Invalid branch status. Expected one of: " "'standalone', 'draft', 'review', 'merged', 'closed'." ) self._status_code = BRANCH_STATUS_TO_CODE[value] @overload def __init__( self, name: str, description: str | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): super().__init__(*args, **kwargs) class BranchUser(BaseSQLRecord, IsLink): class Meta: app_label = "lamindb" unique_together = ("branch", "user", "role") id: int = models.BigAutoField(primary_key=True) branch: Branch = ForeignKey(Branch, CASCADE, related_name="links_user") user: User = ForeignKey("User", PROTECT, related_name="links_branch") role: str = CharField(max_length=32, db_index=True) @doc_args(RECORD_REGISTRY_EXAMPLE) class SQLRecord(BaseSQLRecord, metaclass=Registry): """An object that maps to a row in a SQL table in the database. 
For the inherited `SQLRecord` class method definitions, see :class:`~lamindb.models.BaseSQLRecord`. Every `SQLRecord` is a data model that comes with a registry in form of a SQL table in your database. Sub-classing `SQLRecord` creates a new registry while instantiating a `SQLRecord` creates a new object. {} `SQLRecord`'s metaclass is :class:`~lamindb.models.Registry`. `SQLRecord` inherits from Django's `Model` class. Why does LaminDB call it `SQLRecord` and not `Model`? The term `SQLRecord` can't lead to confusion with statistical, machine learning or biological models. """ # we need the db_default when not interacting via django directly on a required field branch: Branch = ForeignKey( Branch, PROTECT, default=1, db_default=1, related_name="+", ) """The current branch of the object - changes e.g. on merge events.""" created_on: Branch = ForeignKey( Branch, PROTECT, default=1, db_default=1, related_name="+", ) """The branch on which this object was created - never changes.""" space: Space = ForeignKey(Space, PROTECT, default=1, db_default=1, related_name="+") """The space.""" is_locked: bool = BooleanField(default=False, db_default=False) """Whether the object is locked for edits.""" _aux: dict[str, Any] | None = JSONField(default=None, db_default=None, null=True) """Auxiliary field for dictionary-like metadata.""" class Meta: abstract = True def restore(self) -> None: """Restore from trash onto the main branch. Does **not** restore descendant objects if the object is `HasType` with `is_type = True`. """ self.branch_id = 1 self.save() def delete(self, permanent: bool | None = None, **kwargs): """Delete object. If object is `HasType` with `is_type = True`, deletes all descendant objects, too. Args: permanent: Whether to permanently delete the object (skips trash). If `None`, performs soft delete if the object is not already in the trash. Returns: When `permanent=True`, returns Django's delete return value: a tuple of (deleted_count, {registry_name: count}). Otherwise returns None. Examples: For any `SQLRecord` object `sqlrecord`, call:: sqlrecord.delete() """ if self._state.adding: logger.warning("record is not yet saved, delete has no effect") return None name_with_module = self.__class__.__get_name_with_module__() if name_with_module == "Artifact": # this first check means an invalid delete fails fast rather than cascading through # database and storage permission errors isettings = setup_settings.instance if self.storage.instance_uid != isettings.uid and ( kwargs["storage"] or kwargs["storage"] is None ): from ..errors import IntegrityError from .storage import Storage raise IntegrityError( "Cannot simply delete artifacts outside of this instance's managed storage locations." "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`" f"\n(2) If you want to delete the artifact in storage, please connect to the writing lamindb instance (uid={self.storage.instance_uid})." 
f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).to_dataframe()}" ) # change branch_id to trash trash_branch_id = -1 if self.branch_id > trash_branch_id and permanent is not True: if isinstance(self, HasType) and self.is_type: for child in getattr( self, f"query_{self.__class__.__name__.lower()}s" )(): child.delete() delete_record(self, is_soft=True) logger.important(f"moved record to trash: {self}") return None # permanent delete if permanent is None: object_type_name = self.__class__.__name__ log_identifier = self.uid if hasattr(self, "uid") else self.pk response = input( f"{object_type_name} {log_identifier} is already in trash! Are you sure you want to delete it from your" " database? You can't undo this action. (y/n) " ) confirm_delete = response == "y" else: confirm_delete = permanent if confirm_delete: if name_with_module == "Run": from .run import _permanent_delete_runs _permanent_delete_runs(self) return None if name_with_module == "Transform": from .transform import _permanent_delete_transforms _permanent_delete_transforms(self) return None if name_with_module == "Artifact": from .artifact import delete_permanently delete_permanently( self, storage=kwargs["storage"], using_key=kwargs["using_key"] ) return None return super().delete() return None def _format_django_validation_error(record: SQLRecord, e: DjangoValidationError): """Pretty print Django validation errors.""" errors = {} if hasattr(e, "error_dict"): error_dict = e.error_dict else: error_dict = {"__all__": e.error_list} for field_name, error_list in error_dict.items(): for error in error_list: if hasattr(error, "message"): msg = error.message else: msg = str(error) if field_name == "__all__": errors[field_name] = f"{colors.yellow(msg)}" else: current_value = getattr(record, field_name, None) errors[field_name] = ( f"{field_name}: {colors.yellow(current_value)} is not valid\n → {msg}" ) if errors: message = "\n " for _, error in errors.items(): message += error + "\n " return message def _get_record_kwargs(record_class) -> list[tuple[str, str]]: """Gets the parameters of a SQLRecord from the overloaded signature. Example: >>> get_record_params(bt.Organism) >>> [('name', 'str'), ('taxon_id', 'str | None'), ('scientific_name', 'str | None')] """ source = inspect.getsource(record_class) # Find first overload that's not *db_args pattern = r"@overload\s+def __init__\s*\(([\s\S]*?)\):\s*\.{3}" overloads = re.finditer(pattern, source) for single_overload in overloads: params_block = single_overload.group(1) # This is an additional safety measure if the overloaded signature that we're # looking for is not at the top but a "db_args" constructor if "*db_args" in params_block: continue params = [] for line in params_block.split("\n"): line = line.strip() if not line or "self" in line: continue # Extract name and type annotation # The regex pattern finds parameter definitions like: # Simple: name: str # With default: age: int = 0 # With complex types: items: List[str] = [] param_pattern = ( r"(\w+)" # Parameter name r"\s*:\s*" # Colon with optional whitespace r"((?:[^=,]|" # Type hint: either non-equals/comma chars r"(?<=\[)[^[\]]*" # or contents within square brackets r"(?=\]))+)" # looking ahead for closing bracket r"(?:\s*=\s*" # Optional default value part r"([^,]+))?" 
# Default value: anything but comma ) match = re.match(param_pattern, line) if not match: continue name, type_str = match.group(1), match.group(2).strip() # Keep type as string instead of evaluating params.append((name, type_str)) return params return [] def get_name_field( registry: type[SQLRecord] | QuerySet | Manager, *, field: StrField | None = None, ) -> str: """Get the 1st char or text field from the registry.""" if isinstance(registry, (QuerySet, Manager)): registry = registry.model model_field_names = [i.name for i in registry._meta.fields] # set to default name field if field is None: if hasattr(registry, "_name_field"): field = registry._meta.get_field(registry._name_field) elif "name" in model_field_names: field = registry._meta.get_field("name") else: # first char or text field that doesn't contain "id" for i in registry._meta.fields: if "id" in i.name: continue if i.get_internal_type() in {"CharField", "TextField"}: field = i break # no default name field can be found if field is None: raise ValueError( f"Do not know which field to use as name file for registry {registry}, please pass field" ) else: field = field.name # type:ignore if not isinstance(field, str): try: field = field.field.name except AttributeError: raise TypeError( "please pass a SQLRecord string field, e.g., `CellType.name`!" ) from None return field def add_db_connection(db: str, using: str): db_config = dj_database_url.config( default=db, conn_max_age=600, conn_health_checks=True ) db_config["TIME_ZONE"] = "UTC" db_config["OPTIONS"] = {} db_config["AUTOCOMMIT"] = True connections.settings[using] = db_config REGISTRY_UNIQUE_FIELD = {"storage": "root", "ulabel": "name"} def update_fk_to_default_db( records: SQLRecord | list[SQLRecord] | QuerySet, fk: str, using_key: str | None, transfer_logs: dict, ): # here in case it is an iterable, we are checking only a single record # and set the same fks for all other records because we do this only # for certain fks where they have to the same for the whole bulk # see transfer_fk_to_default_db_bulk # todo: but this has to be changed i think, it is not safe as it is now - Sergei record = records[0] if isinstance(records, (list, QuerySet)) else records if getattr(record, f"{fk}_id", None) is not None: # set the space of the transferred record to the current space if fk == "space": # for space we set the record's space to the current space from lamindb import context # the default space has id=1 fk_record_default = Space.get(1) if context.space is None else context.space # process non-space fks else: fk_record = getattr(record, fk) field = REGISTRY_UNIQUE_FIELD.get(fk, "uid") fk_record_default = fk_record.__class__.filter( **{field: getattr(fk_record, field)} ).one_or_none() if fk_record_default is None: from copy import copy fk_record_default = copy(fk_record) transfer_to_default_db( fk_record_default, using_key, save=True, transfer_logs=transfer_logs ) # re-set the fks to the newly saved ones in the default db if isinstance(records, (list, QuerySet)): for r in records: setattr(r, f"{fk}", None) setattr(r, f"{fk}_id", fk_record_default.id) else: setattr(records, f"{fk}", None) setattr(records, f"{fk}_id", fk_record_default.id) FKBULK = [ "organism", "source", "report", # Run ] def transfer_fk_to_default_db_bulk( records: list | QuerySet, using_key: str | None, transfer_logs: dict ): for fk in FKBULK: update_fk_to_default_db(records, fk, using_key, transfer_logs=transfer_logs) def get_transfer_run(record) -> Run: from lamindb import settings from lamindb.core._context 
import context from lamindb.models import Run, Transform from lamindb.models.artifact import WARNING_RUN_TRANSFORM slug = record._state.db owner, name = get_owner_name_from_identifier(slug) cache_using_filepath = ( ln_setup.settings.cache_dir / f"instance--{owner}--{name}--uid.txt" ) if not cache_using_filepath.exists(): raise SystemExit("Need to call .connect() before") instance_uid = cache_using_filepath.read_text().split("\n")[0] # TODO: consider renaming to __lamindb_sync__ key = f"__lamindb_transfer__/{instance_uid}" uid = instance_uid + "0000" transform = Transform.filter(uid=uid).one_or_none() if transform is None: search_names = settings.creation.search_names settings.creation.search_names = False # TODO: consider renaming to "Sync from" transform = Transform( # type: ignore uid=uid, description=f"Transfer from `{slug}`", key=key, kind="function" ).save() settings.creation.search_names = search_names # use the global run context to get the initiated_by_run run id if context.run is not None: initiated_by_run = context.run else: if not settings.creation.artifact_silence_missing_run_warning: logger.warning(WARNING_RUN_TRANSFORM) initiated_by_run = None # it doesn't seem to make sense to create new runs for every transfer run = Run.filter(transform=transform, initiated_by_run=initiated_by_run).first() if run is None: run = Run(transform=transform, initiated_by_run=initiated_by_run).save() # type: ignore run.initiated_by_run = initiated_by_run # so that it's available in memory return run def transfer_to_default_db( record: SQLRecord, using_key: str | None, *, transfer_logs: dict, save: bool = False, transfer_fk: bool = True, ) -> SQLRecord | None: if record._state.db is None or record._state.db == "default": return None registry = record.__class__ logger.debug(f"transferring {registry.__name__} record {record.uid} to default db") record_on_default = registry.objects.filter(uid=record.uid).one_or_none() record_str = f"{record.__class__.__name__}(uid='{record.uid}')" if transfer_logs["run"] is None: transfer_logs["run"] = get_transfer_run(record) if record_on_default is not None: transfer_logs["mapped"].append(record_str) return record_on_default else: transfer_logs["transferred"].append(record_str) if hasattr(record, "created_by_id"): record.created_by = None record.created_by_id = ln_setup.settings.user.id # run & transform run = transfer_logs["run"] if hasattr(record, "run_id"): record.run = None record.run_id = run.id # deal with denormalized transform FK on artifact and collection if hasattr(record, "transform_id"): record.transform = None record.transform_id = run.transform_id # transfer other foreign key fields fk_fields = [ i.name for i in record._meta.fields if i.get_internal_type() == "ForeignKey" if i.name not in {"created_by", "run", "transform", "branch"} ] if not transfer_fk: # don't transfer fk fields that are already bulk transferred fk_fields = [fk for fk in fk_fields if fk not in FKBULK] for fk in fk_fields: update_fk_to_default_db(record, fk, using_key, transfer_logs=transfer_logs) record.id = None record._state.db = "default" if save: record.save() return None def track_current_name_value(record: SQLRecord): # below, we're using __dict__ to avoid triggering the refresh from the database # which can lead to a recursion if hasattr(record, "_name_field"): record._old_name = record.__dict__.get(record._name_field) def check_name_change(record: SQLRecord): """Warns if a record's name has changed.""" from lamindb.models import ( Artifact, Collection, Feature, Schema, 
Storage, Transform, ) if ( not record.pk or not hasattr(record, "_old_name") or not hasattr(record, "_name_field") ): return # key-like records are not checked here if isinstance(record, (Artifact, Collection, Transform)): return # renaming feature sets is not checked if isinstance(record, Schema): return old_name = record._old_name new_name = getattr(record, record._name_field) registry = record.__class__.__name__ if old_name != new_name: if hasattr(record, "artifacts") and not isinstance(record, Storage): linked_records = ( # find all artifacts that are linked to this label via a feature with dtype # matching on the name aka "[registry]" record.artifacts.through.filter( feature___dtype_str__contains=f"[{registry}]", **{f"{registry.lower()}_id": record.pk}, ) ) artifact_uids = list(set(linked_records.to_list("artifact__uid"))) n = len(artifact_uids) if n > 0: s = "s" if n > 1 else "" es = "es" if n == 1 else "" logger.error( f"by {colors.red('renaming label')} from '{old_name}' to '{new_name}' " f"{n} artifact{s} no longer match{es} the label name in storage: {artifact_uids}\n\n" f" → consider re-curating\n" ) elif isinstance(record, Feature): # only internal features of schemas with `itype=Feature` are prone to getting out of sync artifact_uids = Artifact.filter( schemas__features=record, schemas__itype="Feature" ).to_list("uid") n = len(artifact_uids) if n > 0: s = "s" if n > 1 else "" es = "es" if n == 1 else "" logger.warning( f"by {colors.red('renaming feature')} from '{old_name}' to '{new_name}' " f"{n} artifact{s} no longer match{es} the feature name in storage: {artifact_uids}\n" " → consider re-curating" ) def format_field_value(value: datetime | str | Any, none: str = "None") -> str: from datetime import datetime if isinstance(value, datetime): return value.strftime("%Y-%m-%d %H:%M:%S %Z") if isinstance(value, str): try: value = datetime.fromisoformat(value) value = value.strftime("%Y-%m-%d %H:%M:%S %Z") except ValueError: pass return f"'{value}'" if value is None: return none return str(value) class SQLRecordInfo: def __init__(self, registry: Registry): self.registry = registry def _get_type_for_field(self, field_name: str) -> str: field = self.registry._meta.get_field(field_name) related_model_name = ( field.related_model.__name__ if hasattr(field, "related_model") and field.related_model else None ) return related_model_name if related_model_name else field.get_internal_type() def _get_base_class_fields(self) -> list[str]: return [ field.name for base in self.registry.__bases__ if hasattr(base, "_meta") for field in base._meta.get_fields() ] def _reorder_fields_by_class(self, fields_to_order: list[Field]) -> list[Field]: """Reorders the fields so that base class fields come last.""" non_base_class_fields = [ field for field in fields_to_order if field.name not in self._get_base_class_fields() ] found_base_class_fields = [ field for field in fields_to_order if field.name in self._get_base_class_fields() ] return non_base_class_fields + found_base_class_fields def get_simple_fields(self, return_str: bool = False) -> Any: simple_fields = [ field for field in self.registry._meta.get_fields() if not ( isinstance(field, ManyToOneRel) or isinstance(field, ManyToManyRel) or isinstance(field, ManyToManyField) or isinstance(field, ForeignKey) or field.name.startswith("_") or field.name == "id" ) ] simple_fields = self._reorder_fields_by_class(simple_fields) if not return_str: return simple_fields else: repr_str = f" {colors.italic('Simple fields')}\n" if simple_fields: repr_str += 
"".join( [ f" .{field_name.name}: {self._get_type_for_field(field_name.name)}\n" for field_name in simple_fields ] ) return repr_str def get_relational_fields(self, return_str: bool = False): # we ignore ManyToOneRel because it leads to so much clutter in the API # also note that our general guideline is to have related_name="+" # for ForeignKey fields relational_fields = (ManyToOneRel, ManyToManyRel, ManyToManyField, ForeignKey) class_specific_relational_fields = [ field for field in self.registry._meta.fields + self.registry._meta.many_to_many if isinstance(field, relational_fields) and not field.name.startswith(("links_", "_")) ] non_class_specific_relational_fields = [ field for field in self.registry._meta.get_fields() if isinstance(field, relational_fields) and not field.name.startswith(("links_", "_")) ] non_class_specific_relational_fields = self._reorder_fields_by_class( non_class_specific_relational_fields ) # Ensure that class specific fields (e.g. Artifact) come before non-class specific fields (e.g. collection) filtered_non_class_specific = [ field for field in non_class_specific_relational_fields if field not in class_specific_relational_fields ] ordered_relational_fields = ( class_specific_relational_fields + filtered_non_class_specific ) # For Record class, move linked_in fields to the end if self.registry.__name__ == "Record": regular_fields = [ f for f in ordered_relational_fields if not f.name.startswith(("linked_", "values_")) ] linked_fields = [ f for f in ordered_relational_fields if f.name.startswith("linked_") ] values_fields = [ f for f in ordered_relational_fields if f.name.startswith("values_") ] ordered_relational_fields = regular_fields + linked_fields + values_fields core_module_fields = [] external_modules_fields = [] for field in ordered_relational_fields: field_name = repr(field).split(": ")[1][:-1] if field_name.count(".") == 1 and "lamindb" not in field_name: external_modules_fields.append(field) else: core_module_fields.append(field) def _get_related_field_type(field) -> str: model_name = field.related_model.__get_name_with_module__() # Extract the class name (after the last dot if there's a module prefix) class_name = model_name.split(".")[-1] # Skip replacement for compound names like ArtifactBlock, FeatureBlock, etc. 
if class_name.endswith("Block"): # Return just the class name for Block types field_type = class_name else: field_type = ( model_name.replace( "Artifact", "" ).replace( # some fields have an unnecessary 'Artifact' in their name "Collection", "" ) # some fields have an unnecessary 'Collection' in their name ) return ( self._get_type_for_field(field.name) if not field_type.strip() else field_type ) core_module_fields_formatted = [ f" .{field.name}: {_get_related_field_type(field)}\n" for field in core_module_fields ] external_modules_fields_formatted = [ f" .{field.name}: {_get_related_field_type(field)}\n" for field in external_modules_fields ] if not return_str: external_modules_fields_by_modules = defaultdict(list) for field_str, field in zip( external_modules_fields_formatted, external_modules_fields ): field_type = field_str.split(":")[1].split()[0] module_name = field_type.split(".")[0] external_modules_fields_by_modules[module_name].append(field) return core_module_fields, external_modules_fields_by_modules else: repr_str = "" # Non-external relational fields if core_module_fields: repr_str += f" {colors.italic('Relational fields')}\n" repr_str += "".join(core_module_fields_formatted) # External relational fields external_modules = set() for field in external_modules_fields_formatted: field_type = field.split(":")[1].split()[0] external_modules.add(field_type.split(".")[0]) if external_modules: # We want Bionty to show up before other modules external_modules = ( ["bionty"] + sorted(external_modules - {"bionty"}) # type: ignore if "bionty" in external_modules else sorted(external_modules) ) for ext_module in external_modules: ext_module_fields = [ field for field in external_modules_fields_formatted if ext_module in field ] if ext_module_fields: repr_str += ( f" {colors.italic(f'{ext_module.capitalize()} fields')}\n" ) repr_str += "".join(ext_module_fields) return repr_str class Migration(BaseSQLRecord): app = CharField(max_length=255) name = CharField(max_length=255) applied: datetime = DateTimeField() class Meta: db_table = "django_migrations" app_label = "lamindb" managed = False LinkORM = IsLink # backward compat Record = SQLRecord # backward compat BasicRecord = BaseSQLRecord # backward compat RecordInfo = SQLRecordInfo # backward compat ================================================ FILE: lamindb/models/storage.py ================================================ from __future__ import annotations from typing import ( TYPE_CHECKING, overload, ) from uuid import UUID from django.db import models from lamin_utils import logger from lamindb_setup import settings as setup_settings from lamindb_setup.core._hub_core import ( delete_storage_record, get_storage_records_for_instance, select_space, update_storage_with_space, ) from lamindb_setup.core._settings_storage import ( StorageSettings, get_storage_type, init_storage, ) from lamindb_setup.core.upath import check_storage_is_empty, create_path from lamindb.base.fields import ( CharField, TextField, ) from ..base.uids import base62_12 from .run import TracksRun, TracksUpdates from .sqlrecord import Space, SQLRecord if TYPE_CHECKING: from lamindb_setup.types import StorageType from upath import UPath from .artifact import Artifact class Storage(SQLRecord, TracksRun, TracksUpdates): """Storage locations of artifacts such as local directories or S3 buckets. A storage location is either a directory (local or a folder in the cloud) or an entire S3/GCP bucket. 
A storage location is written to by at most one LaminDB instance: the location’s *managing instance*. Some locations are not managed with LaminDB and, hence, do not have a managing instance. .. dropdown:: Writable vs. read-only storage locations The `instance_uid` field of `Storage` defines its *managing instance*. Only if a storage location's `instance_uid` matches your current instance's `uid` (`ln.settings.instance_uid`), you can write to it. All other storage locations are read-only in your current instance. Here is an example (`source `__). .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/eHDmIOAxLEoqZ2oK0000.png :width: 400px Some storage locations are not managed by any LaminDB instance, hence, their `instance_uid` is `None`. .. dropdown:: Managing access to storage locations across instances You can manage access through LaminHub's fine-grained access management or through AWS policies that you attach to your S3 bucket. To enable access management via LaminHub, head over to `https://lamin.ai/{account}/infrastructure`. By clicking the green button that says "Connect S3 bucket", your collaborators will access data based on their LaminHub permissions. :doc:`docs:permissions` has more details. .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/ze8hkgVxVptSSZEU0000.png :width: 800px By default, a storage location inherits the access permissions of its instance. If you want to further restrict access to a storage location, you can move it into a space:: space = ln.Space.get(name="my-space") storage_loc = ln.Storage.get(root="s3://my-storage-location") storage_loc.space = space storage_loc.save() If you don't want to store data in the cloud, you can use local storage locations: :doc:`faq/keep-artifacts-local`. Args: root: `str` The root path of the storage location, e.g., `"./mydir"`, `"s3://my-bucket"`, `"s3://my-bucket/myfolder"`, `"gs://my-bucket/myfolder"`, `"/nfs/shared/datasets/genomics"`, `"/weka/shared/models/"`, ... description: `str | None = None` An optional description. space: `Space | None = None` A space to restrict access permissions to the storage location. host: `str | None = None` For local storage locations, a globally unique identifier for the physical machine/server hosting the storage. This distinguishes storage locations that may have the same local path but exist on different servers, e.g. `"my-institute-cluster-1"`, `"my-server-abcd"`. See Also: :attr:`lamindb.core.Settings.storage` Current default storage location of your compute session for writing artifacts. :attr:`~lamindb.setup.core.StorageSettings` Storage settings. :doc:`faq/keep-artifacts-local` Avoid storing artifacts in the cloud, but keep them on local infrastructure. Examples: When you create a LaminDB instance, you configure its default storage location via `--storage`:: lamin init --storage ./mydatadir # or "s3://my-bucket/myfolder", "gs://my-bucket/myfolder", ... View the current default storage location for writing artifacts:: import lamindb as ln ln.settings.storage Create a new cloud storage location:: ln.Storage(root="s3://our-bucket/our-folder").save() Create a new local storage location:: ln.Storage(root="/dir/our-shared-dir", host="our-server-123").save() Globally switch to another storage location:: ln.settings.storage = "/dir/our-shared-dir" # or "s3://our-bucket/our-folder", "gs://our-bucket/our-folder", ... 
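        Check whether a registered location is writable from the current instance (a sketch; relies on the ``instance_uid`` mechanism described above)::

            loc = ln.Storage.get(root="s3://our-bucket/our-folder")
            is_writable = loc.instance_uid == ln.settings.instance_uid  # True only if the current instance manages the location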
Or if you're operating in `keep-artifacts-local` mode (:doc:`faq/keep-artifacts-local`):: ln.settings.local_storage = "/dir/our-other-shared-dir" View all storage locations used in your LaminDB instance:: ln.Storage.to_dataframe() Notes: .. dropdown:: What is the `.lamindb/` directory inside a storage location? It stores all artifacts that are ingested through `lamindb`, indexed by the artifact `uid`. This means you don't have to worry about renaming or moving files, as this all happens on the database level. Existing artifacts are typically stored in hierarchical structures with semantic folder names. Instead of copying such artifacts into `.lamindb/` upon calls of `Artifact("legacy_path").save()`, LaminDB registers them with the semantic `key` representing the relative path within the storage location. These artifacts are marked with `artifact._key_is_virtual = False` and treated correspondingly. There is only a single `.lamindb/` directory per storage location. .. dropdown:: What should I do if I want to bulk migrate all artifacts to another storage? Currently, you can only achieve this manually and you should be careful with it. 1. Copy or move artifacts into the desired new storage location 2. Adapt the corresponding record in the {class}`~lamindb.Storage` registry by setting the `root` field to the new location 3. If your LaminDB storage location is connected to the hub, you also need to update the storage record on the hub """ class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" _name_field: str = "root" id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, max_length=12, default=base62_12, db_index=True ) """Universal id, valid across DB instances.""" root: str = CharField(db_index=True, unique=True) """Root path of storage (cloud or local path).""" description: str | None = TextField(null=True) """A description.""" type: StorageType = CharField(max_length=30, db_index=True) """Can be "local" vs. "s3" vs. "gs". Is auto-detected from the format of the `root` path.""" region: str | None = CharField(max_length=64, db_index=True, null=True) """Storage region for cloud storage locations. Host identifier for local storage locations.""" instance_uid: str | None = CharField(max_length=12, db_index=True, null=True) """The writing instance. Only the LaminDB instance with this `uid` can write to this storage location. This instance also governs the access permissions of the storage location unless the location is moved into a space. """ artifacts: Artifact """Artifacts contained in this storage location.""" @overload def __init__( self, root: str, *, description: str | None = None, space: Space | None = None, host: str | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args) self._old_space_id = self.space_id return None if args: assert len(args) == 1, ( # noqa: S101 "Storage can only be initialized with a single positional argument, the root path." 
) kwargs["root"] = args[0] if "host" in kwargs: if "type" in kwargs: assert kwargs["type"] == "local", ( # noqa: S101 "type needs to be 'local' if host is set" ) else: kwargs["type"] = "local" assert get_storage_type(kwargs["root"]) == "local", ( # noqa: S101 "root must be a local path if host is set" ) assert "region" not in kwargs, "region must not be set if host is set" # noqa: S101 kwargs["region"] = kwargs.pop("host") storage_record = Storage.filter( root=kwargs["root"], region=kwargs["region"] ).one_or_none() else: storage_record = Storage.filter(root=kwargs["root"]).one_or_none() space = kwargs.get("space", None) if storage_record is not None: from .sqlrecord import init_self_from_db init_self_from_db(self, storage_record) self._old_space_id = self.space_id return None skip_mark_storage_root = kwargs.pop("skip_mark_storage_root", False) skip_preparation = kwargs.pop("_skip_preparation", False) if skip_preparation: assert space is None, "`space` must not be set if _skip_preparation is True" # noqa: S101 super().__init__(*args, **kwargs) return None space_uuid = None if space is not None: hub_space_record = select_space(space.uid) if hub_space_record is None: raise ValueError( "Please first create a space on the hub: https://docs.lamin.ai/access" ) space_uuid = UUID(hub_space_record["id"]) # instance_id won't take effect if # - there is no write access # - the storage location is already managed by another instance ssettings, _ = init_storage( kwargs["root"], instance_id=setup_settings.instance._id, instance_slug=setup_settings.instance.slug, register_hub=setup_settings.instance.is_on_hub, region=kwargs.get("region", None), # host was renamed to region already space_uuid=space_uuid, skip_mark_storage_root=skip_mark_storage_root, ) # ssettings performed validation and normalization of the root path kwargs["root"] = ssettings.root_as_str # noqa: S101 if "instance_uid" in kwargs: assert kwargs["instance_uid"] == ssettings.instance_uid # noqa: S101 else: kwargs["instance_uid"] = ssettings.instance_uid if ssettings._uid is not None: # need private attribute here kwargs["uid"] = ssettings._uid if "type" not in kwargs: kwargs["type"] = ssettings.type else: assert kwargs["type"] == ssettings.type # noqa: S101 if "region" in kwargs: assert kwargs["region"] == ssettings.region # noqa: S101 else: kwargs["region"] = ssettings.region is_managed_by_current_instance = ( ssettings.instance_uid == setup_settings.instance.uid ) if ssettings.instance_uid is not None and not is_managed_by_current_instance: is_managed_by_instance = ( f", is managed by instance with uid {ssettings.instance_uid}" ) else: is_managed_by_instance = "" hub_message = "" if setup_settings.instance.is_on_hub and is_managed_by_current_instance: instance_owner = setup_settings.instance.owner ui_url = setup_settings.instance.ui_url hub_message = f", see: {ui_url}/{instance_owner}/infrastructure" managed_message = ( "created managed" if is_managed_by_current_instance else "referenced read-only" ) logger.important( f"{managed_message} storage location at {kwargs['root']}{is_managed_by_instance}{hub_message}" ) super().__init__(**kwargs) self._old_space_id = self.space_id @property def host(self) -> str | None: """Host identifier for local storage locations. Is `None` for locations with `type != "local"`. A globally unique user-defined host identifier (cluster, server, laptop, etc.). """ if self.type != "local": return None return self.region @property def path(self) -> UPath: """Path. 
Uses the `.root` field and converts it into a `Path` or `UPath`. """ access_token = self._access_token if hasattr(self, "_access_token") else None return create_path(self.root, access_token=access_token) def save(self, *args, **kwargs): """Save the storage record.""" if hasattr(self, "_old_space_id") and self._old_space_id != self.space_id: update_storage_with_space(storage_lnid=self.uid, space_lnid=self.space.uid) super().save(*args, **kwargs) return self def delete(self, permanent: bool | None = None) -> None: # type: ignore # type ignore is there because we don't use a trash here unlike everywhere else """Delete the storage location. This errors in case the storage location is not empty. Unlike other `SQLRecord`-based registries, this does *not* move the storage record into the trash. Args: permanent: `False` raises an error, as soft delete is impossible. """ from .. import settings if permanent is False: raise ValueError( "Soft delete is not possible for Storage, " "use 'permanent=True' or 'permanent=None' for permanent deletion." ) assert not self.artifacts.exists(), ( "Cannot delete storage with artifacts in current instance." ) # noqa: S101 # the simple case of a read-only storage location if self.instance_uid != setup_settings.instance.uid: super(SQLRecord, self).delete() return None # now the complicated case of a written/managed storage location check_storage_is_empty(self.path) assert settings.storage.root_as_str != self.root, ( # noqa: S101 "Cannot delete the current storage location, switch to another." ) if setup_settings.user.handle != "anonymous": # only attempt if authenticated storage_records = get_storage_records_for_instance( # only query those storage records on the hub that are managed by the current instance setup_settings.instance._id ) for storage_record in storage_records: if storage_record["lnid"] == self.uid: assert storage_record["is_default"] in {False, None}, ( # noqa: S101 "Cannot delete default storage of instance." 
) delete_storage_record(storage_record) ssettings = StorageSettings(self.root) if ssettings._mark_storage_root.exists(): ssettings._mark_storage_root.unlink( missing_ok=True # this is totally weird, but needed on Py3.11 ) super(SQLRecord, self).delete() ================================================ FILE: lamindb/models/transform.py ================================================ from __future__ import annotations import warnings from typing import TYPE_CHECKING, overload from django.db import models from django.db.models import CASCADE, PROTECT, Q from lamin_utils import logger from lamindb_setup.core.hashing import HASH_LENGTH, hash_file, hash_string from lamindb.base import deprecated from lamindb.base.fields import ( CharField, DateTimeField, ForeignKey, TextField, ) from lamindb.base.users import current_user_id from .._secret_redaction import redact_secrets_in_source_code from ..models._is_versioned import process_revises from ._is_versioned import IsVersioned, _adjust_is_latest_when_deleting_is_versioned from .run import Run, User from .sqlrecord import ( BaseSQLRecord, IsLink, SQLRecord, init_self_from_db, update_attributes, ) if TYPE_CHECKING: from datetime import datetime from pathlib import Path from lamindb.base.types import TransformKind from .artifact import Artifact from .block import TransformBlock from .project import Project, Reference from .query_manager import RelatedManager from .query_set import QuerySet from .record import Record from .ulabel import ULabel # does not inherit from TracksRun because the Transform # is needed to define a run class Transform(SQLRecord, IsVersioned): """Data transformations such as scripts, notebooks, functions, or pipelines. A `transform` can be a function, a script, a notebook, or a pipeline. If you execute a transform, you generate a run (:class:`~lamindb.Run`). A run has inputs and outputs. Pipelines are typically created with a workflow manager (Nextflow, Snakemake, Prefect, Flyte, Dagster, redun, Airflow, ...). Transforms are versioned so that a given transform version maps on a given source code version. .. dropdown:: Can I sync transforms to git? If you set the environment variable `LAMINDB_SYNC_GIT_REPO` or set `ln.settings.sync_git_repo`, a script-like transform is synced to its hashed state in a git repository upon calling `ln.track()`:: ln.settings.sync_git_repo = "https://github.com/laminlabs/lamindb" ln.track() If the hash isn't found in the git repository, an error is thrown. You can also create transforms that map pipelines via `Transform.from_git()`. The definition of transforms and runs is consistent with the OpenLineage specification where a `transform` would be called a "job" and a `run` a "run". Args: key: `str | None = None` A short name or path-like semantic key. kind: `TransformKind | None = "pipeline"` See :class:`~lamindb.base.types.TransformKind`. version: `str | None = None` A version string. description: `str | None = None` A description. reference: `str | None = None` A reference, e.g., a URL. reference_type: `str | None = None` A reference type, e.g., 'url'. source_code: `str | None = None` Source code of the transform. revises: `Transform | None = None` An old version of the transform. skip_hash_lookup: `bool = False` Skip the hash lookup so that a new transform is created even if a transform with the same hash already exists. See Also: :func:`~lamindb.track` Track a script or notebook run. :class:`~lamindb.Run` Executions of transforms. 
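    A new version within an existing transform family can be created by passing ``revises`` (a sketch; key and version strings are illustrative)::

        v1 = ln.Transform(key="my-pipeline", version="1.0.0", kind="pipeline").save()
        v2 = ln.Transform(key="my-pipeline", version="2.0.0", kind="pipeline", revises=v1).save()
        # v2 now is the latest version in the "my-pipeline" family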
Notes: - :doc:`docs:track` - :doc:`docs:redun` - :doc:`docs:nextflow` - :doc:`docs:snakemake` Examples: Create a transform by running `ln.track()` in a notebook or a script:: ln.track() Create a transform for a standalone function that acts as its own workflow:: @ln.flow() def my_workflow(): print("Hello, world!") Create a transform for a step in a workflow:: @ln.step() def my_step(): print("One step!") Create a transform for a pipeline:: transform = ln.Transform(key="Cell Ranger", version="7.2.0", kind="pipeline").save() Create a transform by saving a Python or shell script or a notebook via the CLI:: lamin save my_script.py lamin save my_script.sh lamin save my_notebook.ipynb """ class Meta(SQLRecord.Meta, IsVersioned.Meta): abstract = False app_label = "lamindb" unique_together = ("key", "hash") _len_stem_uid: int = 12 _len_full_uid: int = 16 _name_field: str = "key" id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, db_index=True, max_length=_len_full_uid ) """Universal id.""" # the max length equals the max length of an S3 key & the artifact key key: str = CharField(db_index=True, max_length=1024) """A name or "/"-separated path-like string. All transforms with the same key are part of the same version family. """ # db_index on description because sometimes we query for equality in the case of artifacts description: str | None = TextField(null=True, db_index=True) """A description.""" kind: TransformKind = CharField( max_length=20, db_index=True, default="pipeline", ) """A string indicating the kind of transform (default `"pipeline"`). One of `"pipeline"`, `"notebook"`, `"script"`, or `"function"`. """ source_code: str | None = TextField(null=True) """Source code of the transform.""" hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True) """Hash of the source code.""" reference: str | None = CharField(max_length=255, db_index=True, null=True) """Reference for the transform, e.g., a URL.""" reference_type: str | None = CharField(max_length=25, db_index=True, null=True) """Reference type of the transform, e.g., 'url'.""" environment: Artifact | None = models.ForeignKey( "Artifact", CASCADE, null=True, related_name="_environment_of_transforms" ) """An environment for executing the transform.""" plan: Artifact | None = models.ForeignKey( "Artifact", CASCADE, null=True, related_name="_plan_for_transforms", default=None, ) """An optional plan for executing this transform.""" runs: RelatedManager[Run] """Runs of this transform ← :attr:`~lamindb.Run.transform`.""" ulabels: RelatedManager[ULabel] = models.ManyToManyField( "ULabel", through="TransformULabel", related_name="transforms" ) """ULabel annotations of this transform ← :attr:`~lamindb.ULabel.transforms`.""" linked_in_records: RelatedManager[Record] = models.ManyToManyField( "Record", through="RecordTransform", related_name="linked_transforms" ) """This transform is linked in these records as a value ← :attr:`~lamindb.Record.linked_transforms`.""" records: RelatedManager[Record] """Records that annotate this transform ← :attr:`~lamindb.Record.transforms`.""" predecessors: RelatedManager[Transform] = models.ManyToManyField( "self", through="TransformTransform", symmetrical=False, related_name="successors", ) """Preceding transforms ← :attr:`~lamindb.Transform.successors`.""" successors: RelatedManager[Transform] """Subsequent transforms ← :attr:`~lamindb.Transform.predecessors`. Allows defining succeeding transforms. 
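    For example, a predecessor/successor pair can be declared explicitly (a sketch; that ``.add()`` is the intended way to link them is an assumption)::

        preprocess = ln.Transform(key="preprocess", kind="pipeline").save()
        train = ln.Transform(key="train", kind="pipeline").save()
        train.predecessors.add(preprocess)  # 'train' now lists 'preprocess' among its predecessors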
Is *not* necessary for data lineage, which is tracked automatically whenever an artifact or collection serves as an input for a run. """ projects: RelatedManager[Project] """Linked projects ← :attr:`~lamindb.Project.transforms`.""" references: RelatedManager[Reference] """Linked references ← :attr:`~lamindb.Reference.transforms`.""" created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" updated_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of last update to record.""" created_by: User = ForeignKey( User, PROTECT, default=current_user_id, related_name="created_transforms" ) """Creator of record ← :attr:`~lamindb.User.created_transforms`.""" ablocks: RelatedManager[TransformBlock] """Attached blocks ← :attr:`~lamindb.TransformBlock.transform`.""" @overload def __init__( self, key: str | None = None, kind: TransformKind | None = None, version: str | None = None, description: str | None = None, reference: str | None = None, reference_type: str | None = None, source_code: str | None = None, revises: Transform | None = None, skip_hash_lookup: bool = False, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None if args: raise ValueError( "Please only use keyword arguments to construct a Transform" ) key: str | None = kwargs.pop("key", None) description: str | None = kwargs.pop("description", None) revises: Transform | None = kwargs.pop("revises", None) version_tag: str | None = kwargs.pop("version_tag", kwargs.pop("version", None)) kind: TransformKind | None = kwargs.pop("kind", None) type: TransformKind | None = kwargs.pop("type", None) if type is not None: warnings.warn( "`type` argument of transform was renamed to `kind` and will be removed in a future release.", DeprecationWarning, stacklevel=2, ) kind = kind if kind is not None else (type if type is not None else "pipeline") reference: str | None = kwargs.pop("reference", None) reference_type: str | None = kwargs.pop("reference_type", None) branch = kwargs.pop("branch", None) branch_id = kwargs.pop("branch_id", 1) space = kwargs.pop("space", None) space_id = kwargs.pop("space_id", 1) skip_hash_lookup: bool = kwargs.pop("skip_hash_lookup", False) using_key = kwargs.pop("using_key", None) # below is internal use that we'll hopefully be able to eliminate uid: str | None = kwargs.pop("uid") if "uid" in kwargs else None source_code: str | None = ( kwargs.pop("source_code") if "source_code" in kwargs else None ) if not len(kwargs) == 0: raise ValueError( "Only key, description, version, kind, type, revises, reference, " f"reference_type can be passed, but you passed: {kwargs}" ) if revises is None: # need to check uid before checking key if uid is not None: revises = ( Transform.objects.using(using_key) .filter(uid__startswith=uid[:-4], is_latest=True) .order_by("-created_at") .first() ) elif key is not None: candidate_for_revises = ( Transform.objects.using(using_key) .filter(~Q(branch_id=-1), key=key, is_latest=True) .order_by("-created_at") .first() ) if candidate_for_revises is not None: revises = candidate_for_revises if candidate_for_revises.source_code is None: # no source code was yet saved, return the same transform logger.important( "no source code was yet saved, returning existing transform with same key" ) uid = revises.uid if revises is not None and uid is not 
None and uid == revises.uid: if revises.key != key: logger.warning("ignoring inconsistent key") init_self_from_db(self, revises) update_attributes(self, {"description": description}) return None if revises is not None and key is not None and revises.key != key: logger.important(f"renaming transform {revises.key} to {key}") new_uid, version_tag, key, description, revises = process_revises( revises, version_tag, key, description, Transform ) # this is only because the user-facing constructor allows passing a uid # most others don't if uid is None: has_consciously_provided_uid = False uid = new_uid else: has_consciously_provided_uid = True hash = None if source_code is not None and not skip_hash_lookup: hash = hash_string(source_code) transform_candidate = Transform.objects.filter( ~Q(branch_id=-1), hash=hash, is_latest=True, ).first() if transform_candidate is not None: init_self_from_db(self, transform_candidate) update_attributes(self, {"description": description}) if key is not None and transform_candidate.key != key: logger.warning( f"key {self.key} on existing transform differs from passed key {key}, keeping original key; update manually if needed or pass skip_hash_lookup if you want to duplicate the transform" ) return None super().__init__( # type: ignore uid=uid, description=description, key=key, kind=kind, version_tag=version_tag, reference=reference, reference_type=reference_type, source_code=source_code, hash=hash, _has_consciously_provided_uid=has_consciously_provided_uid, revises=revises, branch=branch, branch_id=branch_id, space=space, space_id=space_id, ) @classmethod def from_git( cls, url: str, path: str, key: str | None = None, version: str | None = None, entrypoint: str | None = None, branch: str | None = None, description: str | None = None, skip_hash_lookup: bool = False, ) -> Transform: """Create a transform from a path in a git repository. Args: url: URL of the git repository. path: Path to the file within the repository. key: Optional key for the transform. version: Optional version tag to checkout in the repository. entrypoint: One or several optional comma-separated entrypoints for the transform. branch: Optional branch to checkout. description: Optional description for the transform. skip_hash_lookup: Skip the hash lookup so that a new transform is created even if a transform with the same hash already exists. Examples: Create from a Nextflow repo and auto-infer the commit hash from its latest version:: transform = ln.Transform.from_git( url="https://github.com/openproblems-bio/task_batch_integration", path="main.nf" ).save() Create from a Nextflow repo and checkout a specific version:: transform = ln.Transform.from_git( url="https://github.com/openproblems-bio/task_batch_integration", path="main.nf", version="v2.0.0" ).save() assert transform.version_tag == "v2.0.0" Create a *sliding transform* from a Nextflow repo's `dev` branch. 
Unlike a regular transform, a sliding transform doesn't pin a specific source code state, but adapts to whatever the referenced state on the branch is:: transform = ln.Transform.from_git( url="https://github.com/openproblems-bio/task_batch_integration", path="main.nf", branch="dev", version="dev", ).save() Notes: A regular transform pins a specific source code state through its commit hash:: transform.source_code #> repo: https://github.com/openproblems-bio/task_batch_integration #> path: main.nf #> commit: 68eb2ecc52990617dbb6d1bb5c7158d9893796bb A sliding transform infers the source code state from a branch:: transform.source_code #> repo: https://github.com/openproblems-bio/task_batch_integration #> path: main.nf #> branch: dev If an entrypoint is provided, it is added to the source code below the path, e.g.:: transform.source_code #> repo: https://github.com/openproblems-bio/task_batch_integration #> path: main.nf #> entrypoint: myentrypoint #> commit: 68eb2ecc52990617dbb6d1bb5c7158d9893796bb Note that you can pass a comma-separated list of entrypoints to the `entrypoint` argument. """ from ..core._sync_git import get_and_validate_git_metadata url, commit_hash = get_and_validate_git_metadata(url, path, version, branch) if key is None: key = ( url.split("/")[-2] + "/" + url.split("/")[-1].replace(".git", "") + "/" + path ) logger.important(f"inferred key '{key}' from url & path") source_code = f"repo: {url}\npath: {path}" if entrypoint is not None: source_code += f"\nentrypoint: {entrypoint}" if branch is not None and version == branch: from urllib.parse import quote # sliding transform, no defined source code state source_code += f"\nbranch: {branch}" reference, reference_type = ( f"{url}/tree/{quote(branch, safe='')}/{path}", "url", ) else: # regular transform, defined source code state source_code += f"\ncommit: {commit_hash}" reference, reference_type = f"{url}/blob/{commit_hash}/{path}", "url" return Transform( key=key, kind="pipeline", version=version, description=description, reference=reference, reference_type=reference_type, source_code=source_code, skip_hash_lookup=skip_hash_lookup, ) @property def latest_run(self) -> Run: """The latest run of this transform.""" return self.runs.order_by("-started_at").first() @property @deprecated(new_name="kind") def type(self) -> TransformKind: return self.kind @type.setter def type(self, value: TransformKind): self.kind = value def view_lineage(self, with_successors: bool = False, distance: int = 5): """View lineage of transforms. Note that this only accounts for manually defined predecessors and successors. Auto-generate lineage through inputs and outputs of runs is not included. """ from .has_parents import view_parents return view_parents( record=self, field="key", with_children=with_successors, distance=distance, attr_name="predecessors", ) def _update_source_code_from_path(self, source_code_path: Path) -> None | str: _, transform_hash, _ = hash_file(source_code_path) # ignore hash_type for now source_code = source_code_path.read_text() source_code_to_store, redaction_count = redact_secrets_in_source_code( source_code ) if redaction_count > 0: logger.warning( f"redacted {redaction_count} secret-looking assignment(s) before persisting transform source code" ) if self.hash is not None: # check if the hash of the transform source code matches if transform_hash != self.hash: response = input( f"You are about to overwrite existing source code (hash '{self.hash}') for Transform('{self.uid}')." f" Proceed? 
(y/n) " ) if response == "y": self.source_code = source_code_to_store self.hash = transform_hash else: logger.warning("Please re-run `ln.track()` to make a new version") return "rerun-the-notebook" else: logger.debug("source code is already saved") else: self.source_code = source_code_to_store self.hash = transform_hash return None def _permanent_delete_transforms(transforms: Transform | QuerySet) -> None: """Execute bulk DELETE on transforms (runs, then transforms). Used by QuerySet and single-transform paths.""" from django.db.models import QuerySet as DjangoQuerySet from .project import TransformProject if isinstance(transforms, Transform): db = transforms._state.db or "default" qs = Transform.objects.using(db).filter(pk=transforms.pk) else: db = transforms.db or "default" qs = transforms objects = list(qs) if not objects: return _adjust_is_latest_when_deleting_is_versioned(objects) transform_ids = [o.pk for o in objects] TransformProject.objects.using(db).filter(transform_id__in=transform_ids).delete() Run.objects.using(db).filter(transform_id__in=transform_ids).delete(permanent=True) DjangoQuerySet.delete(qs) class TransformTransform(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) successor: Transform = ForeignKey( "Transform", CASCADE, related_name="links_predecessor" ) predecessor: Transform = ForeignKey( "Transform", CASCADE, related_name="links_successor" ) config: dict | None = models.JSONField(default=None, null=True) created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now() ) created_by: User = ForeignKey( "lamindb.User", PROTECT, default=current_user_id, related_name="+" ) class Meta: app_label = "lamindb" unique_together = ("successor", "predecessor") ================================================ FILE: lamindb/models/ulabel.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, overload import pgtrigger from django.conf import settings as django_settings from django.db import models from django.db.models import CASCADE, PROTECT from lamindb.base.fields import ( CharField, DateTimeField, ForeignKey, TextField, ) from lamindb.errors import FieldValidationError from ..base.uids import base62_8 from .can_curate import CanCurate from .feature import Feature from .has_parents import HasParents, _query_relatives from .run import Run, TracksRun, TracksUpdates, User, current_user_id from .sqlrecord import BaseSQLRecord, HasType, IsLink, SQLRecord, _get_record_kwargs from .transform import Transform if TYPE_CHECKING: from datetime import datetime from .artifact import Artifact from .block import ULabelBlock from .collection import Collection from .project import Project from .query_manager import RelatedManager from .query_set import QuerySet from .record import Record from .sqlrecord import Branch class ULabel(SQLRecord, HasType, HasParents, CanCurate, TracksRun, TracksUpdates): """Universal labels. It behaves like `Record`, just without the ability to link features. Args: name: `str` A name. description: `str | None = None` A description. reference: `str | None = None` For instance, an external ID or a URL. reference_type: `str | None = None` For instance, `"url"`. See Also: :class:`~lamindb.Record` Like `ULabel`, but with the ability to link features. 
Examples: Create a label and annotate an :class:`~lamindb.Artifact`:: train_split = ln.ULabel(name="train").save() artifact.ulabels.add(train_split) Query artifacts by label:: ln.Artifact.filter(ulabels=train_split).to_dataframe() Organize ulabels in a type hierarchy, based on the `type` field:: split_type = ln.ULabel(name="Split", is_type=True).save() train_split = ln.ULabel(name="train", type=split_type).save() The `type` hierarchy gives rise to a tree. If you need to model a full DAG-like **ontology**, use the `parents`/`children` fields:: cell_type = ln.Record(name="CellType", is_type=True).save() t_cell = ln.Record(name="T Cell", type=cell_type).save() cd4_t_cell = ln.Record(name="CD4+ T Cell", type=cell_type).save() t_cell.children.add(cd4_t_cell) If you work with basic biological entities like cell lines, cell types, tissues, consider building on the public biological ontologies in :mod:`bionty`, which work in the same way. """ class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" if ( django_settings.DATABASES.get("default", {}).get("ENGINE") == "django.db.backends.postgresql" ): triggers = [ pgtrigger.Trigger( name="prevent_ulabel_type_cycle", operation=pgtrigger.Update | pgtrigger.Insert, when=pgtrigger.Before, condition=pgtrigger.Condition("NEW.type_id IS NOT NULL"), func=""" -- Check for direct self-reference IF NEW.type_id = NEW.id THEN RAISE EXCEPTION 'Cannot set type: ulabel cannot be its own type'; END IF; -- Check for cycles in the type chain IF EXISTS ( WITH RECURSIVE type_chain AS ( SELECT type_id, 1 as depth FROM lamindb_ulabel WHERE id = NEW.type_id UNION ALL SELECT r.type_id, tc.depth + 1 FROM lamindb_ulabel r INNER JOIN type_chain tc ON r.id = tc.type_id WHERE tc.depth < 100 ) SELECT 1 FROM type_chain WHERE type_id = NEW.id ) THEN RAISE EXCEPTION 'Cannot set type: would create a cycle'; END IF; RETURN NEW; """, ), ] # also see raw SQL constraints for `is_type` and `type` FK validity in migrations _name_field: str = "name" id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, db_index=True, max_length=8, default=base62_8 ) """A universal random id, valid across DB instances.""" name: str = CharField(max_length=150, db_index=True) """Name or title of ulabel.""" type: ULabel | None = ForeignKey("self", PROTECT, null=True, related_name="ulabels") """Type of ulabel, e.g., `"donor"`, `"split"`, etc. ← :attr:`~lamindb.ULabel.ulabels` Allows to group ulabels by type, e.g., all donors, all split ulabels, etc. """ ulabels: RelatedManager[ULabel] """ULabels of this type (can only be non-empty if `is_type` is `True`).""" description: str | None = TextField(null=True) """A description.""" reference: str | None = CharField(max_length=255, db_index=True, null=True) """A simple reference like URL or external ID.""" reference_type: str | None = CharField(max_length=25, db_index=True, null=True) """Type of simple reference.""" parents: RelatedManager[ULabel] = models.ManyToManyField( "self", symmetrical=False, related_name="children" ) """Parent entities of this ulabel ← :attr:`~lamindb.ULabel.children`. For advanced use cases, you can build an ontology under a given `type`. Say, if you modeled `CellType` as a `ULabel`, you would introduce a type `CellType` and model the hierarchy of cell types under it. """ children: RelatedManager[ULabel] """Child entities of this ulabel. Reverse accessor for parents.
""" transforms: RelatedManager[Transform] """The transforms annotated by this ulabel ← :attr:`~lamindb.Transform.ulabels`.""" runs: RelatedManager[Run] """The runs annotated by this ulabel ← :attr:`~lamindb.Run.ulabels`.""" artifacts: RelatedManager[Artifact] = models.ManyToManyField( "Artifact", through="ArtifactULabel", related_name="ulabels" ) """The artifacts annotated by this ulabel ← :attr:`~lamindb.Artifact.ulabels`.""" collections: RelatedManager[Collection] """The collections annotated by this ulabel ← :attr:`~lamindb.Collection.ulabels`.""" projects: RelatedManager[Project] """The projects annotating this ulabel ← :attr:`~lamindb.Project.ulabels`.""" branches: RelatedManager[Branch] """The branches annotated by this ulabel ← :attr:`~lamindb.Branch.ulabels`.""" linked_in_records: RelatedManager[Record] = models.ManyToManyField( "Record", through="RecordULabel", related_name="linked_ulabels", ) """Records linking this ulabel as a value ← :attr:`~lamindb.Record.linked_ulabels`.""" ablocks: RelatedManager[ULabelBlock] """Attached blocks ← :attr:`~lamindb.ULabelBlock.ulabel`.""" @overload def __init__( self, name: str, type: ULabel | None = None, is_type: bool = False, description: str | None = None, reference: str | None = None, reference_type: str | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None if len(args) > 0: raise ValueError("Only one non-keyword arg allowed") name: str = kwargs.pop("name", None) type: str | None = kwargs.pop("type", None) is_type: bool = kwargs.pop("is_type", False) description: str | None = kwargs.pop("description", None) reference: str | None = kwargs.pop("reference", None) reference_type: str | None = kwargs.pop("reference_type", None) branch = kwargs.pop("branch", None) branch_id = kwargs.pop("branch_id", 1) space = kwargs.pop("space", None) space_id = kwargs.pop("space_id", 1) _skip_validation = kwargs.pop("_skip_validation", False) _aux = kwargs.pop("_aux", None) if len(kwargs) > 0: valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(ULabel)]) raise FieldValidationError( f"Only {valid_keywords} are valid keyword arguments" ) super().__init__( name=name, type=type, is_type=is_type, description=description, reference=reference, reference_type=reference_type, branch=branch, branch_id=branch_id, space=space, space_id=space_id, _skip_validation=_skip_validation, _aux=_aux, ) def query_ulabels(self) -> QuerySet: """Query ulabels of sub types. While `.ulabels` retrieves the ulabels with the current type, this method also retrieves sub types and the ulabels with sub types of the current type. 
""" return _query_relatives([self], "ulabels") # type: ignore class ArtifactULabel(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey("Artifact", CASCADE, related_name="links_ulabel") ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_artifact") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, related_name="links_artifactulabel", default=None ) class Meta: # can have the same label linked to the same artifact if the feature is # different app_label = "lamindb" unique_together = ("artifact", "ulabel", "feature") class TransformULabel(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) transform: Transform = ForeignKey(Transform, CASCADE, related_name="links_ulabel") ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_transform") class Meta: app_label = "lamindb" unique_together = ("transform", "ulabel") class RunULabel(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) run: Run = ForeignKey(Run, CASCADE, related_name="links_ulabel") ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_run") created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" created_by: User = ForeignKey( "lamindb.User", PROTECT, default=current_user_id, related_name="+" ) """Creator of record.""" class Meta: app_label = "lamindb" unique_together = ("run", "ulabel") class BranchULabel(BaseSQLRecord, IsLink): """Link model for branch–ulabel association.""" id: int = models.BigAutoField(primary_key=True) branch: Branch = ForeignKey("Branch", CASCADE, related_name="links_ulabel") ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_branch") class Meta: app_label = "lamindb" unique_together = ("branch", "ulabel") class CollectionULabel(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) collection: Collection = ForeignKey( "Collection", CASCADE, related_name="links_ulabel" ) ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_collection") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, related_name="links_collectionulabel", default=None ) class Meta: app_label = "lamindb" unique_together = ("collection", "ulabel") ================================================ FILE: lamindb/py.typed ================================================ ================================================ FILE: lamindb/setup/__init__.py ================================================ import lamindb_setup as _lamindb_setup from lamindb_setup import * # noqa: F403 from lamindb_setup import ( connect, delete, init, settings, ) from . import core, errors, types from ._merge import merge # noqa: F401 from ._switch import switch # noqa: F401 del connect # we have this at the root level, hence, we don't want it here __doc__ = _lamindb_setup.__doc__.replace("lamindb_setup", "lamindb.setup") settings.__doc__ = settings.__doc__.replace("lamindb_setup", "lamindb.setup") ================================================ FILE: lamindb/setup/_merge.py ================================================ # Tested in lamin-cli (tests/core/test_create_switch_delete_list_settings.py::test_merge*). 
from __future__ import annotations from typing import TYPE_CHECKING import lamindb_setup as ln_setup from django.apps import apps from django.db import connection from django.db.utils import DatabaseError from lamin_utils import logger if TYPE_CHECKING: from lamindb.models import Branch def merge(branch: str | Branch) -> None: """Merge a branch into the current branch. All `SQLRecord` objects that have `branch_id` equal to the source branch's id are updated to the current branch's id. Find more info in the :class:`~lamindb.Branch` document. Args: branch: The source branch to merge from. Accepts a `name`, a `uid`, or the `Branch` object. Raises: DoesNotExist: If the branch does not exist. """ from lamindb import Branch, Q from lamindb.errors import ObjectDoesNotExist from ..models import SQLRecord from ..models._is_versioned import IsVersioned, reconcile_is_latest_within_branch from ..models.sqlrecord import BRANCH_SENSITIVE_BLOCK_MODEL_NAMES if isinstance(branch, Branch): source = branch if source._state.adding: raise ObjectDoesNotExist("Branch must be saved.") else: source = Branch.filter(Q(name=branch) | Q(uid=branch)).one_or_none() if source is None: raise ObjectDoesNotExist(f"Branch '{branch}' not found.") current = ln_setup.settings.branch if current.id == source.id: logger.important("already on branch, nothing to merge") return sqlrecord_models = [ m for m in apps.get_models() if issubclass(m, SQLRecord) and not m._meta.abstract ] attached_block_models = [ model for model_name in sorted(BRANCH_SENSITIVE_BLOCK_MODEL_NAMES) if (model := apps.get_model("lamindb", model_name)) is not None ] models = list(dict.fromkeys([*sqlrecord_models, *attached_block_models])) if not models: return vendor = connection.vendor quoted_tables = [connection.ops.quote_name(m._meta.db_table) for m in models] with connection.cursor() as cursor: if vendor == "postgresql": # Single round-trip: one multi-statement execute statements = [ f"UPDATE {tbl} SET branch_id = %s WHERE branch_id = %s" for tbl in quoted_tables ] sql = "BEGIN; " + "; ".join(statements) + "; COMMIT;" params = [current.id, source.id] * len(quoted_tables) try: cursor.execute(sql, params) except DatabaseError as e: logger.error(f"Merge failed: {e}") raise else: # SQLite: execute() runs only the first statement; run each UPDATE # in a loop (same connection, so still one transaction if we're inside # a transaction or use autocommit-off). from django.db import transaction with transaction.atomic(): for tbl in quoted_tables: # Django uses %s; SQLite backend converts to ? cursor.execute( f"UPDATE {tbl} SET branch_id = %s WHERE branch_id = %s", [current.id, source.id], ) versioned_models = [m for m in models if issubclass(m, IsVersioned)] for model in versioned_models: reconcile_is_latest_within_branch(model, branch_id=current.id) source._status_code = -1 # merged source.save(update_fields=["_status_code"]) logger.important(f"merged branch '{source.name}' into '{current.name}'") ================================================ FILE: lamindb/setup/_switch.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING from lamin_utils import logger from lamindb_setup import settings if TYPE_CHECKING: from lamindb.models import Branch def switch(target: str | Branch, *, space: bool = False, create: bool = False): """Switch to a branch or space, create if not exists. Args: target: Branch target or space target to switch to. space: If True, switch space; otherwise switch branch. 
create: If True and switching branch, create the branch if it does not exist. """ if space: settings.space = target else: if create: from lamindb import Branch, Q from lamindb.errors import BranchAlreadyExists # Consistent with git switch -c: error if branch already exists. existing = Branch.filter(Q(name=target) | Q(uid=target)).one_or_none() if existing is not None: raise BranchAlreadyExists( f"Branch '{target}' already exists. Omit -c/--create to switch to it." ) Branch(name=target).save() logger.important(f"created branch: {target}") settings.branch = target logger.important(f"switched to {target}") ================================================ FILE: lamindb/setup/core/__init__.py ================================================ import lamindb_setup as _lamindb_setup from lamindb_setup.core import * # noqa: F403 __doc__ = _lamindb_setup.core.__doc__.replace("lamindb_setup", "lamindb.setup") ================================================ FILE: lamindb/setup/errors/__init__.py ================================================ import lamindb_setup as _lamindb_setup from lamindb_setup.errors import * # noqa: F403 __doc__ = _lamindb_setup.errors.__doc__.replace("lamindb_setup", "lamindb.setup") ================================================ FILE: lamindb/setup/types/__init__.py ================================================ import lamindb_setup as _lamindb_setup from lamindb_setup.types import * # noqa: F403 __doc__ = _lamindb_setup.types.__doc__.replace("lamindb_setup", "lamindb.setup") ================================================ FILE: lamindb_full.py ================================================ """Full/meta-package module for the `lamindb` distribution.""" from __future__ import annotations import re from pathlib import Path _INIT_FILE = Path(__file__).parent / "lamindb" / "__init__.py" _MATCH = re.search(r'__version__\s*=\s*"([^"]+)"', _INIT_FILE.read_text()) if _MATCH is None: raise RuntimeError(f"Could not parse __version__ from {_INIT_FILE}") __version__ = _MATCH.group(1) ================================================ FILE: noxfile.py ================================================ import os import shutil from pathlib import Path import nox from laminci import convert_executable_md_files, upload_docs_artifact from laminci.nox import ( build_docs, login_testuser1, login_testuser2, run, run_pre_commit, ) # we'd like to aggregate coverage information across sessions # and for this the code needs to be located in the same # directory in every github action runner # this also allows to break out an installation section nox.options.default_venv_backend = "none" IS_PR = os.getenv("GITHUB_EVENT_NAME") != "push" CI = os.environ.get("CI") # SpatialData.write() regression with ome-zarr>=0.14: # https://github.com/scverse/spatialdata/issues/1090 SPATIALDATA_OME_ZARR_CONSTRAINT = "ome-zarr<0.14.0" GROUPS = {} GROUPS["tutorial"] = [ "README.ipynb", "sync.ipynb", "arrays.ipynb", "registries.ipynb", ] GROUPS["guide"] = [ "track.ipynb", ] GROUPS["tiledbsoma"] = [ "curate.ipynb", ] GROUPS["biology"] = [ "manage-ontologies.ipynb", ] @nox.session def lint(session: nox.Session) -> None: run_pre_commit(session) @nox.session def install(session): base_deps = [ "./sub/lamin-cli", "./sub/lamindb-setup", "./sub/bionty", ] top_deps = [ ".[full,dev]", ] cmds = [ f"uv pip install {'--system' if CI else ''} --no-cache-dir {' '.join(base_deps)}", ] + [ f"uv pip install {'--system' if CI else ''} --no-cache-dir -e {dep}" for dep in top_deps ] [run(session, line) for line in cmds] @nox.session 
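# the CI install groups below largely mirror the parametrized `test` session further down;
# each group installs only the extras and pinned packages it needs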
@nox.parametrize( "group", [ "unit-core-sqlite", "unit-core-postgres", "unit-storage", "no-instance", "tutorial", "guide", "tiledbsoma", "biology", "faq", "storage", "curator", "integrations", "docs", "cli", "permissions", ], ) def install_ci(session, group): extras = "" if group in ["unit-core-sqlite", "unit-core-postgres"]: extras += "fcs" run(session, "uv pip install --system scanpy") run(session, "uv pip install --system mudata") # spatialdata dependency, specifying it here explicitly # otherwise there are problems with uv resolver run(session, "uv pip install --system xarray-dataclasses") run( session, f"uv pip install --system spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT}", ) elif group == "unit-storage": extras += "gcp" run(session, "uv pip install --system huggingface_hub") run(session, "uv pip install --system scanpy") run(session, "uv pip install --system polars") elif group == "tutorial": # anndata here to prevent installing older version on release run(session, "uv pip install --system huggingface_hub polars anndata==0.12.2") elif group == "guide": extras += "zarr_v2" run( session, f"uv pip install --system scanpy mudata spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT}", ) elif group == "tiledbsoma": extras += "zarr_v2" run( session, f"uv pip install --system scanpy mudata spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT} tiledbsoma", ) elif group == "biology": extras += "fcs" run(session, "uv pip install --system ipywidgets") elif group == "faq": extras += "zarr_v2" elif group == "storage": extras += "zarr_v2" run( session, "uv pip install --system --no-deps ./sub/pertdb", ) run(session, "uv pip install --system vitessce") elif group == "curator": run( session, "uv pip install --system --no-deps ./sub/pertdb", ) # spatialdata dependency, specifying it here explicitly # otherwise there are problems with uv resolver run(session, "uv pip install --system xarray-dataclasses") run( session, f"uv pip install --system spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT}", ) elif group == "integrations": run(session, "uv pip install --system lightning") elif group == "docs": extras += "zarr_v2" # spatialdata dependency, specifying it here explicitly # otherwise there are problems with uv resolver run(session, "uv pip install --system xarray-dataclasses") run( session, f"uv pip install --system mudata spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT} lightning", ) run( session, "uv pip install --system --no-deps ./sub/pertdb", ) elif group == "cli": pass elif group == "permissions": pass extras = "," + extras if extras != "" else extras run(session, f"uv pip install --system -e .[full,dev{extras}]") # on the release branch, do not use submodules but run with pypi install # only exception is the docs group which should always use the submodule # to push docs fixes fast # installing this after lamindb to be sure that these packages won't be reinstaled # during lamindb installation if IS_PR or group == "docs": run( session, "uv pip install --system ./sub/lamindb-setup ./sub/lamin-cli ./sub/bionty ./sub/pertdb", ) if group == "permissions": # have to install after lamindb installation # because lamindb downgrades django required by laminhub_rest cmds = "uv pip install --system ./laminhub/backend" cmds += "\nuv pip install --system ./laminhub/backend/utils" cmds += "\nuv pip install --system ./laminhub/backend/services/central" cmds += "\nuv pip install --system ./laminhub/backend/services/instancedb" cmds += "\nuv pip install --system ./laminhub/backend/services/aws" cmds += "\nuv pip install --system 
--no-deps ./laminhub/backend/services/instancedb/hubmodule" [run(session, line) for line in cmds.splitlines()] @nox.session def configure_coverage(session) -> None: """Write a coverage config file, adding extra patterns to omit.""" import tomlkit groups_str = session.posargs[0] # first positional argument print(groups_str) # for debugging # so that we don't change this away from string assert isinstance(groups_str, str) # noqa: S101 if "curator" not in groups_str and "tiledbsoma" not in groups_str: extra_omit_patterns = ["**/curators/*"] else: extra_omit_patterns = [] # Read patterns from pyproject.toml base_config_path = Path("pyproject.toml") with open(base_config_path) as f: config = tomlkit.load(f) # Update the omit patterns base_patterns = config["tool"]["coverage"]["run"]["omit"] all_patterns = base_patterns + extra_omit_patterns config["tool"]["coverage"]["run"]["omit"] = all_patterns # Write back to pyproject.toml with open(base_config_path, "w") as f: tomlkit.dump(config, f) print(base_config_path.read_text()) @nox.session def prepare(session): """Create executable files to run during a test session. Is not needed for unit tests! """ content = open("README.md").read() # cannot execute the flow after ln.track() was called content = content.replace(" create_fasta()", " pass") open("README_stripped.md", "w").write( "\n".join( line for line in content.split("\n") if not line.strip().startswith( ("accessor = artifact.open()", "ln.track(project=", "ln.Project(name=") ) ) ) os.system("jupytext README_stripped.md --to notebook --output ./docs/README.ipynb") convert_executable_md_files() os.system("cp ./tests/core/test_artifact_parquet.py ./docs/scripts/") os.system("cp ./lamindb/examples/schemas/define_valid_features.py ./docs/scripts/") os.system( "cp ./lamindb/examples/schemas/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py ./docs/scripts/" ) os.system( "cp ./lamindb/examples/datasets/define_mini_immuno_features_labels.py ./docs/scripts/" ) os.system( "cp ./lamindb/examples/datasets/define_mini_immuno_schema_flexible.py ./docs/scripts/" ) os.system( "cp ./lamindb/examples/datasets/save_mini_immuno_datasets.py ./docs/scripts/" ) @nox.session @nox.parametrize( "group", [ "unit-core-sqlite", "unit-core-postgres", "unit-storage", "no-instance", "curator", "integrations", "tutorial", "guide", "tiledbsoma", "biology", "faq", "storage", "cli", "permissions", ], ) def test(session, group): # we likely don't need auth in many other groups, but have to carefully expand this if group not in {"curator", "no-instance"}: login_testuser2(session) login_testuser1(session) # this is mostly needed for the docs so that we don't render Django's entire public API run(session, "lamin settings set private-django-api true") coverage_args = "--cov=lamindb --cov-config=pyproject.toml --cov-append --cov-report=term-missing" duration_args = "--durations=10" env = os.environ.copy() if group == "unit-core-sqlite": env["LAMINDB_TEST_DB_VENDOR"] = "sqlite" run( session, f"pytest {coverage_args} ./tests/core {duration_args}", env=env, ) elif group == "unit-core-postgres": env["LAMINDB_TEST_DB_VENDOR"] = "postgresql" run( session, f"pytest {coverage_args} ./tests/core {duration_args}", env=env, ) elif group == "unit-storage": login_testuser2(session) # shouldn't be necessary but is for now run(session, f"pytest {coverage_args} ./tests/storage {duration_args}") elif group == "no-instance": run(session, "lamin disconnect") run(session, f"pytest {coverage_args} ./tests/no_instance {duration_args}") elif 
group == "tutorial": run(session, "lamin logout") run(session, "lamin init --storage ./test-readme --modules bionty") run( session, f"pytest -s {coverage_args} ./docs/test_notebooks.py::test_{group}" ) elif group == "guide": run( session, f"pytest -s {coverage_args} ./docs/test_notebooks.py::test_{group}", ) elif group == "tiledbsoma": run( session, ( f"pytest {coverage_args} tests/tiledbsoma " "./docs/test_notebooks.py::test_tiledbsoma " f"{duration_args}" ), ) elif group == "biology": run( session, f"pytest -s {coverage_args} ./docs/test_notebooks.py::test_{group}", ) elif group == "faq": run(session, f"pytest -s {coverage_args} ./docs/faq") elif group == "storage": run(session, f"pytest -s {coverage_args} ./docs/storage") elif group == "curator": run( session, f"pytest {coverage_args} tests/curators {duration_args}", ) elif group == "integrations": run(session, f"pytest -s {coverage_args} tests/integrations") elif group == "cli": run( session, f"pytest {coverage_args} ./sub/lamin-cli/tests/core {duration_args}", ) elif group == "permissions": run(session, f"pytest {coverage_args} ./tests/permissions") # move artifacts into right place if group in {"tutorial", "guide", "tiledbsoma", "biology"}: target_dir = Path(f"./docs/{group}") target_dir.mkdir(exist_ok=True) for filename in GROUPS[group]: shutil.copy(Path("docs") / filename, target_dir / filename) @nox.session def clidocs(session): def generate_cli_docs(): os.environ["NO_RICH"] = "1" from lamin_cli.__main__ import COMMAND_GROUPS, _generate_help page = "# CLI\n\n" helps = _generate_help() # First, add the main lamin command main_help = helps.get("main") if main_help: help_string = main_help["help"].replace("Usage: main", "Usage: lamin") help_docstring = main_help["docstring"] if help_docstring: page += f"{help_docstring}\n\n" # below is ugly # page += f"```text\n{help_string}\n```\n\n" # Create a mapping of command names to their full keys in helps command_to_key = {} for name in helps.keys(): names = name.split(" ") if len(names) == 2: # e.g., "lamin connect" command_name = names[1] command_to_key[command_name] = name # Group commands by their categories command_groups = COMMAND_GROUPS.get("lamin", []) processed_commands = set() for group in command_groups: group_name = group["name"] group_commands = group["commands"] page += f"## {group_name}\n\n" for command_name in group_commands: if command_name in command_to_key: full_key = command_to_key[command_name] help_dict = helps[full_key] processed_commands.add(command_name) help_string = help_dict["help"].replace("Usage: main", "lamin") help_docstring = help_dict["docstring"] pyr_alt_delimiter = "→ Python/R alternative:" if pyr_alt_delimiter in help_docstring: help_docstring, pyr_alt_string = help_docstring.split( pyr_alt_delimiter ) else: pyr_alt_string = "" page += f"### {command_name}\n\n" if help_docstring: page += f"{help_docstring}\n" command_block = f"```text\n{help_string}\n```" page += f"\n\nOptions:\n\n{command_block}\n\n" if pyr_alt_string: page += f"{pyr_alt_delimiter}{pyr_alt_string}\n\n" # Add any remaining commands that aren't in groups remaining_commands = [] for command_name, full_key in command_to_key.items(): if command_name not in processed_commands: remaining_commands.append((command_name, full_key)) if remaining_commands: page += "## Other\n\n" for command_name, full_key in remaining_commands: help_dict = helps[full_key] help_string = help_dict["help"].replace("Usage: main", "Usage: lamin") help_docstring = help_dict["docstring"] page += f"### lamin 
{command_name}\n\n" if help_docstring: page += f"{help_docstring}\n\n" page += f"```text\n{help_string}\n```\n\n" Path("./docs/cli.md").write_text(page) generate_cli_docs() @nox.session def docs(session): # move artifacts into right place run(session, "lamin settings set private-django-api true") for group in ["tutorial", "guide", "tiledbsoma", "biology", "faq", "storage"]: if Path(f"./docs-{group}").exists(): if Path(f"./docs/{group}").exists(): shutil.rmtree(f"./docs/{group}") Path(f"./docs-{group}").rename(f"./docs/{group}") # move back to root level if group in {"tutorial", "guide", "tiledbsoma", "biology"}: for path in Path(f"./docs/{group}").glob("*"): path.rename(f"./docs/{path.name}") run( session, "lamin init --storage ./docsbuild --modules bionty,pertdb", ) build_docs(session, strip_prefix=True, strict=False) upload_docs_artifact() ================================================ FILE: pyproject.full.toml ================================================ [build-system] requires = ["flit_core >=3.2,<4"] build-backend = "flit_core.buildapi" [project] name = "lamindb" requires-python = ">=3.10,<=3.14" authors = [{name = "Lamin Labs", email = "open-source@lamin.ai"}] readme = "README.md" dynamic = ["version", "description"] classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", ] dependencies = [ "lamindb-core[full]==2.4.2", ] [project.urls] Home = "https://github.com/laminlabs/lamindb" [project.optional-dependencies] gcp = [ "lamindb_setup[gcp]", ] zarr_v2 = [ "numcodecs<0.16.0", # 0.16.0 breaks zarr<3.0.* "zarr>=2.16.0,<3.0.0a0", # not yet compatible with 3.0.* ] fcs = [ "readfcs>=2.0.1", ] dev = [ # basic test "tomlkit", "line_profiler", "pre-commit", "nox", "laminci>=0.3", "pytest>=6.0", "coverage", "pytest-cov<7.0.0", # v7 drops support for subprocess measurement "mudata", # others "nbproject_test>=0.6.0", # biology "faker-biology", # bionty "pronto", ] [tool.flit.module] name = "lamindb_full" [tool.flit.sdist] exclude = [ "sub/" ] ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["flit_core >=3.2,<4"] build-backend = "flit_core.buildapi" [project] name = "lamindb-core" requires-python = ">=3.10,<=3.14" authors = [{name = "Lamin Labs", email = "open-source@lamin.ai"}] readme = "README.md" dynamic = ["version", "description"] classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", ] dependencies = [ "lamin_utils==0.16.4", # no dependencies "lamin_cli==1.16.0", # no dependencies "lamindb_setup[aws]==1.25a1", # dependencies like Django & fsspec "psycopg2-binary", ] [project.urls] Home = "https://github.com/laminlabs/lamindb" [project.optional-dependencies] # full: keep in sync with pyproject.full.toml dependencies (excluding lamindb-core). # If you change duplicated deps here, update pyproject.full.toml too, and vice versa. 
full = [ # LaminDB optional modules, included to avoid users forgetting about extras "bionty>=2.3.1,<3", # 30kB pure python, no dependencies "pertdb>=2.2.0,<3", # 30kB pure python, no dependencies # Jupyter -- small packages with few & small dependencies "jupytext", "nbconvert>=7.2.1", # bound to avoid lxml[html_clean] dependency "nbproject==0.11.1", # adds orjson # Data & validation dependencies (heavier) "pyarrow", "pandera>=0.24.0", "pandas>=2.0.0,<3.0.0", # for .infer_objects(copy=False) in lamin-utils; not yet compatible with Pandas 3.0.0 "anndata>=0.10.0,<=0.12.10", # backed sparse is incompatible with scipy 1.15.0 for anndata 1.11.1 # Runtime utilities "graphviz", "scipy<1.17.0", # 1.17.0 is incompatible with anndata<0.12.7 "pyyaml", "typing_extensions!=4.6.0", "python-dateutil", ] gcp = [ "lamindb_setup[gcp]", ] zarr_v2 = [ "numcodecs<0.16.0", # 0.16.0 breaks zarr<3.0.* "zarr>=2.16.0,<3.0.0a0", # not yet compatible with 3.0.* ] fcs = [ "readfcs>=2.0.1", ] dev = [ # basic test "tomlkit", "line_profiler", "pre-commit", "nox", "laminci>=0.3", "pytest>=6.0", "coverage", "pytest-cov<7.0.0", # v7 drops support for subprocess measurement "mudata", # others "nbproject_test>=0.6.0", # biology "faker-biology", # bionty "pronto", ] [tool.flit.module] name = "lamindb" [tool.ruff] src = ["src"] line-length = 88 lint.select = [ "F", # Errors detected by Pyflakes "E", # Error detected by Pycodestyle "W", # Warning detected by Pycodestyle "I", # isort "D", # pydocstyle "B", # flake8-bugbear "TID", # flake8-tidy-imports "C4", # flake8-comprehensions "BLE", # flake8-blind-except "UP", # pyupgrade "RUF100", # Report unused noqa directives "TCH", # Typing imports "NPY", # Numpy specific rules "PTH", # Use pathlib "S" # Security ] lint.ignore = [ # Do not catch blind exception: `Exception` "BLE001", # Errors from function calls in argument defaults. These are fine when the result is immutable. 
"B008", # line too long -> we accept long comment lines; black gets rid of long code lines "E501", # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient "E731", # allow I, O, l as variable names -> I is the identity matrix "E741", # Missing docstring in public module "D100", # undocumented-public-class "D101", # Missing docstring in public method "D102", # Missing docstring in public function "D103", # Missing docstring in public package "D104", # __magic__ methods are are often self-explanatory, allow missing docstrings "D105", # Missing docstring in public nested class "D106", # Missing docstring in __init__ "D107", "D405", "D214", "D416", ## Disable one in each pair of mutually incompatible rules # We don’t want a blank line before a class docstring "D203", # 1 blank line required after class docstring "D204", # first line should end with a period [Bug: doesn't work with single-line docstrings] # We want docstrings to start immediately after the opening triple quote "D213", # Section underline is over-indented ("{name}") "D215", # First line should be in imperative mood; try rephrasing "D401", # First word of the first line should be capitalized: {} -> {} "D403", # First word of the docstring should not be "This" "D404", # Section name should end with a newline ("{name}") "D406", # Missing dashed underline after section ("{name}") "D407", # Section underline should be in the line following the section's name ("{name}") "D408", # Section underline should match the length of its name ("{name}") "D409", # No blank lines allowed between a section header and its content ("{name}") "D412", # Missing blank line after last section ("{name}") "D413", # Missing argument description in the docstring "D417", # camcelcase imported as lowercase "N813", # module import not at top level of file "E402", # open()` should be replaced by `Path.open() "PTH123", # subprocess` call: check for execution of untrusted input - https://github.com/PyCQA/bandit/issues/333 "S603", # Starting a process with a partial executable path "S607", # Prefer absolute imports over relative imports from parent modules "TID252", # Asserts "S101", # Standard pseudo-random generators are not suitable for cryptographic purposes "S311", # Starting a process with a shell: seems safe, but may be changed in the future; consider rewriting without `shell` "S605", # Possible SQL injection vector through string-based query construction "S608", # All of the below TODO 3.10 refactor, temporarily disable "S602", "UP007", "UP038", "B905", "UP035", "RUF100", ] [tool.ruff.lint.pydocstyle] convention = "google" [tool.ruff.lint.per-file-ignores] "docs/*" = ["I", "S101"] "tests/**/*.py" = [ "D", # docstrings are allowed to look a bit off "S101", # asserts allowed in tests... "ARG", # Unused function args -> fixtures nevertheless are functionally relevant... "FBT", # Don't care about booleans as positional arguments in tests, e.g. via @pytest.mark.parametrize() "PLR2004", # Magic value used in comparison, ... 
"S311", # Standard pseudo-random generators are not suitable for cryptographic purposes ] "tests/**/*.ipynb" = ["S101"] "*/__init__.py" = ["F401"] "lamindb/core/types.py" = ["F401"] [tool.pytest.ini_options] testpaths = [ "tests", ] filterwarnings = [ "ignore::SyntaxWarning:pronto", "ignore:::pronto.ontology", "ignore::UserWarning:xarray_schema", "ignore::DeprecationWarning:botocore.*", "ignore::DeprecationWarning:xarray_schema", "ignore::DeprecationWarning:geopandas", "ignore::DeprecationWarning:tiledbsoma", "ignore::DeprecationWarning:pkg_resources", "ignore::FutureWarning:spatialdata", "ignore::FutureWarning:mudata", "ignore::UserWarning:anndata", "ignore:Jupyter is migrating its paths to use standard platformdirs:DeprecationWarning", "ignore:The 'train_dataloader' does not have many workers:UserWarning", ] markers = [ "pg_integration: tests that require an external PostgreSQL instance" ] [tool.coverage.report] exclude_lines = [ "if TYPE_CHECKING:", "@abstractmethod", "@abc.abstractmethod" ] [tool.coverage.run] omit = ["**/examples/datasets/*", "**/migrations/*", "**/curators/_legacy.py", "**/core/_compat.py", "**/core/types.py"] [tool.flit.sdist] exclude = [ "sub/" ] ================================================ FILE: scripts/migrate_test_instances.py ================================================ #!/usr/bin/env python3 """Migrate all LaminDB instances used in lamindb tests. For each instance: connect, run migrations, create storage snapshot. Run from repo root with: python scripts/migrate_test_instances.py """ import subprocess import sys INSTANCES = [ "laminlabs/lamin-site-assets", "laminlabs/lamin-dev", "laminlabs/lamindata", "laminlabs/cellxgene", "laminlabs/bionty-assets", "laminlabs/pertdata", ] def run(cmd: str) -> None: result = subprocess.run(cmd, shell=True) if result.returncode != 0: sys.exit(result.returncode) def main() -> None: for instance in INSTANCES: print(f"=== Migrating {instance} ===") run(f"lamin connect {instance}") run("lamin migrate deploy") run("lamin io snapshot") print() print("Done. 
All test instances migrated and snapshotted.") if __name__ == "__main__": main() ================================================ FILE: tests/core/_dataset_fixtures.py ================================================ from pathlib import Path from typing import Generator import lamindb as ln import numpy as np import pandas as pd import pytest from scipy.sparse import csr_matrix @pytest.fixture(scope="session") def get_small_adata(): # shouldn't need anndata installed to run tests import anndata as ad return ad.AnnData( X=np.array([[1, 2, 3], [4, 5, 6]]), obs={"feat1": ["A", "B"]}, var=pd.DataFrame(index=["MYC", "TCF7", "GATA1"]), obsm={"X_pca": np.array([[1, 2], [3, 4]])}, ) @pytest.fixture(scope="session") def get_small_mdata(): # shouldn't need mudata installed to run tests import anndata as ad import mudata as md adata1 = ad.AnnData( X=np.array([[1, 2, 3], [4, 5, 6]]), obs={"feat1": ["A", "B"]}, var=pd.DataFrame(index=["MYC", "TCF7", "GATA1"]), obsm={"X_pca": np.array([[1, 2], [3, 4]])}, ) adata2 = ad.AnnData( X=np.array([[7, 8], [9, 10]]), obs={"feat2": ["C", "D"]}, var=pd.DataFrame(index=["FOXP3", "CD8A"]), obsm={"X_umap": np.array([[5, 6], [7, 8]])}, ) return md.MuData({"rna": adata1, "protein": adata2}) @pytest.fixture(scope="session") def get_small_sdata(): # shouldn't need spatialdata installed to run tests import anndata as ad import spatialdata as sd adata = ad.AnnData( X=csr_matrix(np.array([[0.1, 0.2], [0.3, 0.4]])), obs=pd.DataFrame(index=["cell1", "cell2"]), var=pd.DataFrame(index=["gene1", "gene2"]), ) { "region1": np.array([[[0, 0], [0, 1], [1, 1], [1, 0]]]), "region2": np.array([[[2, 2], [2, 3], [3, 3], [3, 2]]]), } sdata_obj = sd.SpatialData( tables={"gene_expression": adata}, ) return sdata_obj @pytest.fixture(scope="session") def get_mini_csv() -> Generator[Path, None, None]: csv_path = ln.examples.datasets.file_mini_csv() yield csv_path Path("mini.csv").unlink(missing_ok=True) ================================================ FILE: tests/core/conftest.py ================================================ import os import shutil from pathlib import Path from subprocess import DEVNULL, run from time import perf_counter import anndata as ad import lamindb as ln import lamindb_setup as ln_setup import numpy as np import pandas as pd import pytest # for artifact fixtures import yaml # type: ignore from lamin_utils import logger from laminci.db import setup_local_test_postgres def pytest_sessionstart(): t_execute_start = perf_counter() ln_setup._TESTING = True os.environ["LAMIN_TESTING"] = "true" is_postgresql = os.getenv("LAMINDB_TEST_DB_VENDOR") == "postgresql" if is_postgresql: print("running tests on PostgreSQL") else: os.environ["LAMINDB_TEST_DB_VENDOR"] = "sqlite" print("running tests on SQLite") if is_postgresql is False: ln.setup.init( storage="./default_storage_unit_core", modules="bionty", name="lamindb-unit-tests-core", ) else: try: pgurl = setup_local_test_postgres() except RuntimeError: run("docker stop pgtest && docker rm pgtest", shell=True, stdout=DEVNULL) # noqa: S602 pgurl = setup_local_test_postgres() ln.setup.init( storage="./default_storage_unit_core", modules="bionty", name="lamindb-unit-tests-core", db=pgurl, ) ln.settings.creation.artifact_silence_missing_run_warning = True total_time_elapsed = perf_counter() - t_execute_start print(f"time to setup the instance: {total_time_elapsed:.1f}s") def pytest_sessionfinish(session: pytest.Session): logger.set_verbosity(1) shutil.rmtree("./default_storage_unit_core") ln.setup.delete("lamindb-unit-tests-core", 
force=True) del os.environ["LAMIN_TESTING"] if not os.getenv("LAMINDB_TEST_DB_VENDOR") == "sqlite": run("docker stop pgtest && docker rm pgtest", shell=True, stdout=DEVNULL) # noqa: S602 @pytest.fixture def ccaplog(caplog) -> pytest.LogCaptureFixture: """Add caplog handler to our custom logger at session start.""" from lamin_utils._logger import logger logger.addHandler(caplog.handler) yield caplog logger.removeHandler(caplog.handler) @pytest.fixture( scope="function", params=[ # tuple of is_in_registered_storage, path, suffix, hash of test_dir (True, "./default_storage_unit_core/", ".csv", "iGtHiFEBV3r1_TFovdQCgw"), (True, "./default_storage_unit_core/", "", "iGtHiFEBV3r1_TFovdQCgw"), (True, "./registered_storage/", ".csv", "iGtHiFEBV3r1_TFovdQCgw"), (True, "./registered_storage/", "", "iGtHiFEBV3r1_TFovdQCgw"), (False, "./nonregistered_storage/", ".csv", "iGtHiFEBV3r1_TFovdQCgw"), (False, "./nonregistered_storage/", "", "iGtHiFEBV3r1_TFovdQCgw"), ], ) def get_test_filepaths(request): # -> Tuple[bool, Path, Path, Path, str] is_in_registered_storage: bool = request.param[0] root_dir: Path = Path(request.param[1]) suffix: str = request.param[2] hash_test_dir: str = request.param[3] if is_in_registered_storage: # ensure that it's actually registered if ln.Storage.filter(root=root_dir.resolve().as_posix()).one_or_none() is None: ln.Storage(root=root_dir.resolve().as_posix(), type="local").save() else: assert ( ln.Storage.filter(root=root_dir.resolve().as_posix()).one_or_none() is None ) test_dirpath = root_dir / "my_dir/" test_dirpath.mkdir(parents=True, exist_ok=True) # create a first file test_filepath0 = test_dirpath / f"my_file{suffix}" test_filepath0.write_text("0") # create a second, duplicated file test_filepath1 = test_dirpath / f"my_file1{suffix}" test_filepath1.write_text("0") # create a non-duplicated file test_filepath2 = test_dirpath / f"my_file2{suffix}" test_filepath2.write_text("1") # return a boolean indicating whether test filepath is in default storage # and the test filepath yield ( is_in_registered_storage, root_dir, test_dirpath, test_filepath0, suffix, hash_test_dir, ) shutil.rmtree(test_dirpath) @pytest.fixture(scope="function") def registered_storage_file_and_folder(): root_dir = Path("./registered_storage_suffix_fixture") storage_root = root_dir.resolve().as_posix() if ln.Storage.filter(root=storage_root).one_or_none() is None: ln.Storage(root=storage_root, type="local").save() test_dirpath = root_dir / "suffix_fixture_dir" test_dirpath.mkdir(parents=True, exist_ok=True) test_filepath = test_dirpath / "suffix_fixture_file.csv" test_filepath.write_text("a,b\n1,2\n") folder_path = root_dir / "suffix_fixture_folder" folder_path.mkdir(parents=True, exist_ok=True) (folder_path / "nested.txt").write_text("content") yield test_filepath, folder_path shutil.rmtree(test_dirpath, ignore_errors=True) shutil.rmtree(folder_path, ignore_errors=True) @pytest.fixture(scope="session") def example_dataframe(): return pd.DataFrame({"feat1": [1, 2], "feat2": [3, 4]}) @pytest.fixture(scope="session") def adata_file(): adata = ad.AnnData( X=np.array([[1, 2, 3], [4, 5, 6]]), obs={"feat1": ["A", "B"]}, var=pd.DataFrame(index=["MYC", "TCF7", "GATA1"]), obsm={"X_pca": np.array([[1, 2], [3, 4]])}, ) filepath = Path("adata_file.h5ad") adata.write(filepath) yield "adata_file.h5ad" filepath.unlink() @pytest.fixture(scope="session") def tsv_file(): filepath = Path("test.tsv") pd.DataFrame([1, 2]).to_csv(filepath, sep="\t") yield filepath filepath.unlink() @pytest.fixture(scope="session") def 
zip_file(): filepath = Path("test.zip") pd.DataFrame([1, 2]).to_csv(filepath, sep="\t") yield filepath filepath.unlink(missing_ok=True) @pytest.fixture(scope="session") def yaml_file(): filepath = Path("test.yaml") dct = {"a": 1, "b": 2} with open(filepath, "w") as f: yaml.dump(dct, f) yield filepath filepath.unlink() @pytest.fixture(scope="session") def fcs_file(): fcs_path = ln.examples.datasets.file_fcs_alpert19() yield fcs_path fcs_path.unlink() @pytest.fixture(scope="session") def mudata_file(get_small_mdata): filepath = Path("test.h5mu") get_small_mdata.write(filepath) yield filepath filepath.unlink() @pytest.fixture(scope="session") def spatialdata_file(get_small_sdata): filepath = Path("test.zarr") get_small_sdata.write(filepath) yield filepath shutil.rmtree(filepath) ================================================ FILE: tests/core/notebooks/basic-r-notebook.Rmd.cleaned.html ================================================
library(laminr)

db <- connect()
→ connected lamindb: laminlabs/lamindata
db$track("lOScuxDTDE0q0000")
→ loaded Transform('lOScuxDT'), started Run('GWpaTtUg') at 2024-12-01 17:49:18 UTC
db$finish()
MoreOUTPUT 
================================================ FILE: tests/core/notebooks/basic-r-notebook.Rmd.html ================================================ My exemplary R analysis

My exemplary R analysis

library(laminr)

db <- connect()
→ connected lamindb: laminlabs/lamindata
db$track("lOScuxDTDE0q0000")
→ loaded Transform('lOScuxDT'), started Run('GWpaTtUg') at 2024-12-01 17:49:18 UTC
db$finish()
MoreOUTPUT ! please hit SHORTCUT to save the notebook in your editor and re-run finish()
================================================ FILE: tests/core/notebooks/duplicate/with-title-initialized-consecutive-finish.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# My duplicated test notebook (consecutive) with `ln.finish()`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This has actually different content than the original one in the `notebooks/` folder." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import lamindb as ln\n", "\n", "ln.track()" ] } ], "metadata": { "kernelspec": { "display_name": "py310", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: tests/core/notebooks/load_schema.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "0", "metadata": {}, "outputs": [], "source": [ "import lamindb as ln" ] }, { "cell_type": "code", "execution_count": null, "id": "1", "metadata": {}, "outputs": [], "source": [ "# this is a test case because we had an issue with path resolution at some point: https://github.com/laminlabs/lamindb/pull/3211\n", "valid_features = ln.examples.schemas.valid_features()" ] }, { "cell_type": "code", "execution_count": null, "id": "2", "metadata": {}, "outputs": [], "source": [ "valid_features.delete(permanent=True)" ] } ], "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: tests/core/notebooks/no-title.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "0", "metadata": {}, "source": [ "A notebook without title." 
] }, { "cell_type": "code", "execution_count": null, "id": "1", "metadata": {}, "outputs": [], "source": [ "import lamindb as ln" ] }, { "cell_type": "code", "execution_count": null, "id": "2", "metadata": {}, "outputs": [], "source": [ "# pass stem uid\n", "ln.track(\"123456789ABC\")" ] }, { "cell_type": "code", "execution_count": null, "id": "3", "metadata": {}, "outputs": [], "source": [ "assert ln.context.transform.description is None\n", "assert ln.context.transform.key == \"no-title.ipynb\"" ] } ], "metadata": { "kernelspec": { "display_name": "py312", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" }, "nbproject": { "id": "Irn3xQyQ40GU", "pypackage": { "nbproject": "0.0.7+2.g8521e30" }, "time_init": "2022-06-08T14:42:31.551211+00:00", "version": "0" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: tests/core/notebooks/with-title-initialized-consecutive-finish-not-last-cell.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# My test notebook (consecutive) with `ln.finish()` not in last cell" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import lamindb as ln" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# do not pass uid purposefully\n", "ln.track()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"my consecutive cell\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ln.finish(ignore_non_consecutive=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"my consecutive cell\")" ] } ], "metadata": { "kernelspec": { "display_name": "py39", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: tests/core/notebooks/with-title-initialized-consecutive-finish.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# My test notebook (consecutive) with `ln.finish()`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import lamindb as ln\n", "import pytest" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with pytest.raises(ln.errors.InvalidArgument) as error:\n", " ln.track(\"ujPaFZ\")\n", "print(error.exconly())\n", "assert error.exconly().startswith(\n", " 'lamindb.errors.InvalidArgument: Please pass an auto-generated uid instead of \"ujPaFZ\". 
Resolve by running:'\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# with uid passed\n", "ln.track(\"ujPaFZatnMLG0000\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"my consecutive cell\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"my consecutive cell\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ln.finish()" ] } ], "metadata": { "kernelspec": { "display_name": "py312", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: tests/core/scripts/duplicate1/script-to-test-versioning.py ================================================ import lamindb as ln ln.context.version = "1" ln.track("Ro1gl7n8YrdH0001") ================================================ FILE: tests/core/scripts/duplicate2/script-to-test-versioning.py ================================================ import lamindb as ln ln.context.version = "2" ln.track("Ro1gl7n8YrdH0002") assert ln.context.transform.version_tag == "2" ================================================ FILE: tests/core/scripts/duplicate3/script-to-test-versioning.py ================================================ import lamindb as ln ln.context.version = "3" ln.track("Ro1gl7n8YrdH0002") ================================================ FILE: tests/core/scripts/duplicate4/script-to-test-versioning.py ================================================ import lamindb as ln ln.track() ================================================ FILE: tests/core/scripts/duplicate5/script-to-test-versioning.py ================================================ import lamindb as ln # different from the one in duplicate4 ln.track() ln.finish() ================================================ FILE: tests/core/scripts/script-to-test-filename-change.py ================================================ import lamindb as ln ln.track("Ro1gl7n8YrdH0001") ================================================ FILE: tests/core/scripts/script-to-test-versioning.py ================================================ import lamindb as ln ln.context.version = "1" ln.track("Ro1gl7n8YrdH0000") ================================================ FILE: tests/core/test_artifact_anndata_with_curation.py ================================================ import lamindb as ln def test_create_anndata_with_curation(): adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") feature1 = ln.Feature(name="sample_note", dtype=str).save() # ingest the first time artifact = ln.Artifact.from_anndata( adata, key="examples/mini_immuno1.h5ad", schema="ensembl_gene_ids_and_valid_features_in_obs", ).save() # capture the obs_schema because we'll overwrite it obs_schema = artifact.features.slots["obs"] # define another feature so that upon re-ingestion, we track more than before # (this also tests non-trivial idempotency) feature2 = ln.Feature(name="treatment_time_h", dtype=int).save() artifact = ln.Artifact.from_anndata( adata, key="examples/mini_immuno1.h5ad", schema="ensembl_gene_ids_and_valid_features_in_obs", ).save() schemas = artifact.features.slots 
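# cleanup: permanently delete the re-ingested artifact, the schemas inferred for its slots, the captured obs_schema, and both features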
artifact.delete(permanent=True) for schema in schemas.values(): schema.delete(permanent=True) obs_schema.delete(permanent=True) feature1.delete(permanent=True) feature2.delete(permanent=True) ================================================ FILE: tests/core/test_artifact_basics.py ================================================ """Artifact tests. Also see `test_artifact_folders.py` for tests of folder-like artifacts. """ # ruff: noqa: F811 import shutil import sys from pathlib import Path, PurePosixPath from types import ModuleType, SimpleNamespace from unittest.mock import patch import anndata as ad import h5py import lamindb as ln import lamindb_setup import mudata as md import pandas as pd import pytest import zarr from _dataset_fixtures import ( # noqa get_mini_csv, get_small_adata, get_small_mdata, get_small_sdata, ) from lamindb.core.loaders import load_fcs, load_to_memory, load_tsv from lamindb.core.storage.paths import ( AUTO_KEY_PREFIX, auto_storage_key_from_artifact_uid, check_path_is_child_of_root, delete_storage, ) from lamindb.errors import ( FieldValidationError, InvalidArgument, ) from lamindb.models.artifact import ( data_is_scversedatastructure, get_relative_path_to_directory, process_data, ) from lamindb_setup.core.upath import ( CloudPath, LocalPathClasses, UPath, extract_suffix_from_path, ) # how do we properly abstract out the default storage variable? # currently, we're only mocking it through `storage` as set in conftest.py ln.settings.verbosity = "success" @pytest.fixture def data(request): if request.param == "get_small_adata": return request.getfixturevalue("get_small_adata") else: return request.param # ------------------------------------------------------------------------------------- # Basic construction # ------------------------------------------------------------------------------------- def test_basic_validation(): # extra kwargs with pytest.raises(FieldValidationError): ln.Artifact("testpath.csv", description="test1b", extra_kwarg="extra") # > 1 args with pytest.raises(ValueError) as error: ln.Artifact("testpath.csv", "testpath.csv") assert error.exconly() == "ValueError: Only one non-keyword arg allowed: path" # AUTO_KEY_PREFIX in key with pytest.raises(ValueError) as error: ln.Artifact(".gitignore", key=".lamindb/test_df.parquet") assert ( error.exconly() == f"ValueError: Do not pass key that contains a managed storage path in `{AUTO_KEY_PREFIX}`" ) # path that contains AUTO_KEY_PREFIX with pytest.raises(ValueError) as error: ln.Artifact(".lamindb/test_df.parquet", description="Test") assert ( error.exconly() == f"ValueError: Do not pass path inside the `{AUTO_KEY_PREFIX}` directory." 
) @pytest.mark.parametrize("key_is_virtual", [True, False]) @pytest.mark.parametrize("key", [None, "my_new_dir/my_artifact.csv", "nosuffix"]) @pytest.mark.parametrize("description", [None, "my description"]) def test_create_from_path_file(get_test_filepaths, key_is_virtual, key, description): ln.settings.creation._artifact_use_virtual_keys = key_is_virtual is_in_registered_storage = get_test_filepaths[0] root_dir = get_test_filepaths[1] test_filepath = get_test_filepaths[3] suffix = get_test_filepaths[4] # path suffix if key is not None: key_suffix = extract_suffix_from_path( PurePosixPath(key), arg_name="key" ) # key suffix else: key_suffix = None # this tests if insufficient information is being provided if key is None and not is_in_registered_storage and description is None: # this can fail because ln.track() might set a global run context # in that case, the Artifact would have a run that's not None and the # error below wouldn't be thrown with pytest.raises(ValueError) as error: artifact = ln.Artifact(test_filepath, key=key, description=description) assert ( error.exconly() == "ValueError: Pass one of key, run or description as a parameter" ) return None elif key is not None and suffix != key_suffix: with pytest.raises(InvalidArgument) as error: artifact = ln.Artifact(test_filepath, key=key, description=description) assert error.exconly() == ( f"lamindb.errors.InvalidArgument: The passed path's suffix '{suffix}' must match the passed key's suffix '{key_suffix}'." ) return None elif key is not None and is_in_registered_storage: inferred_key = get_relative_path_to_directory( path=test_filepath, directory=root_dir ).as_posix() try: artifact = ln.Artifact(test_filepath, key=key, description=description) except InvalidArgument as error: assert str(error) == ( f"The path '{test_filepath}' is already in registered storage" f" '{root_dir.resolve().as_posix()}' with key '{inferred_key}'\nYou" f" passed conflicting key '{key}': please move the file before" " registering it." 
) return None else: artifact = ln.Artifact(test_filepath, key=key, description=description) assert artifact._state.adding # make sure that this is a new file in the db assert ( artifact.description is None if description is None else artifact.description == description ) assert artifact.suffix == suffix assert artifact.n_files is None artifact.save() assert artifact.path.exists() # check get by path assert ln.Artifact.get(path=artifact.path) == artifact if key is None: assert ( artifact.key == f"my_dir/my_file{suffix}" if is_in_registered_storage else artifact.key is None ) if is_in_registered_storage: assert artifact.storage.root == root_dir.resolve().as_posix() assert artifact.path == test_filepath.resolve() else: assert artifact.storage.root == lamindb_setup.settings.storage.root_as_str assert ( artifact.path == lamindb_setup.settings.storage.root / f".lamindb/{artifact.uid}{suffix}" ) else: assert artifact.key == key assert artifact._key_is_virtual == key_is_virtual if is_in_registered_storage: # this would only hit if the key matches the correct key assert artifact.storage.root == root_dir.resolve().as_posix() assert ( artifact.path == root_dir / f"{key}{suffix}" == test_filepath.resolve() ) else: # file is moved into default storage if key_is_virtual: assert ( artifact.path == lamindb_setup.settings.storage.root / f".lamindb/{artifact.uid}{suffix}" ) else: assert artifact.path == lamindb_setup.settings.storage.root / key # only delete from storage if a file copy took place delete_from_storage = str(test_filepath.resolve()) != str(artifact.path) artifact.delete(permanent=True, storage=delete_from_storage) ln.settings.creation._artifact_use_virtual_keys = True @pytest.mark.parametrize("key_is_virtual", [True, False]) @pytest.mark.parametrize("key", [None, "my_new_file.tsv"]) def test_create_from_path_file_with_explicit_key_is_virtual( tsv_file, key_is_virtual, key ): artifact = ln.Artifact( tsv_file, description="test explicit key is virtual", key=key, _key_is_virtual=key_is_virtual, ) assert artifact.key == key assert artifact._key_is_virtual == key_is_virtual artifact.save() assert artifact.path.exists() root = lamindb_setup.settings.storage.root if not key_is_virtual and key is not None: assert artifact.path == root / key else: assert artifact.path == root / f".lamindb/{artifact.uid}.tsv" artifact.delete(permanent=True, storage=True) def test_create_from_empty_files_skips_hash_lookup(tmp_path): path_1 = tmp_path / "empty-1.txt" path_2 = tmp_path / "empty-2.txt" path_1.write_text("") path_2.write_text("") artifact_1 = ln.Artifact(path_1, key=f"{tmp_path.name}/empty-1.txt").save() artifact_2 = ln.Artifact(path_2, key=f"{tmp_path.name}/empty-2.txt") assert artifact_2.uid != artifact_1.uid assert artifact_2.key == f"{tmp_path.name}/empty-2.txt" assert artifact_2.hash == artifact_1.hash artifact_2.save() assert artifact_2.id != artifact_1.id artifact_2.delete(permanent=True) artifact_1.delete(permanent=True) @pytest.mark.parametrize("key", [None, "my_new_folder"]) def test_create_from_path_folder(get_test_filepaths, key): # get variables from fixture is_in_registered_storage = get_test_filepaths[0] test_dirpath = get_test_filepaths[2] hash_test_dir = get_test_filepaths[5] if key is None and not is_in_registered_storage: with pytest.raises(ValueError) as error: ln.Artifact(test_dirpath, key=key) assert error.exconly().startswith( "ValueError: Pass one of key, run or description as a parameter" ) return None artifact1 = ln.Artifact(test_dirpath, key=key) if key is not None and 
is_in_registered_storage: assert artifact1._real_key is not None # should fail because we are passing a path in an existing storage with a virtual key with pytest.raises(ValueError) as error: ln.Artifact(test_dirpath, key=key, _key_is_virtual=False) assert error.exconly().startswith( "ValueError: Passing a path in an existing storage with a virtual key and _key_is_virtual=False is incompatible." ) else: assert artifact1._real_key is None # check that passing _key_is_virtual=True is incompatible with a path in an existing storage without a virtual key if key is None and is_in_registered_storage: with pytest.raises(ValueError) as error: ln.Artifact(test_dirpath, key=key, _key_is_virtual=True) assert error.exconly().startswith( "ValueError: Passing a path in an existing storage without a virtual key and _key_is_virtual=True is incompatible." ) assert artifact1.n_files == 3 assert artifact1.hash == hash_test_dir assert artifact1._state.adding assert artifact1.description is None assert artifact1.path.exists() artifact1.save() # run tests on re-creating the Artifact artifact2 = ln.Artifact(test_dirpath, key=key, description="something") assert not artifact2._state.adding assert artifact1.id == artifact2.id assert artifact1.uid == artifact2.uid assert artifact1.storage == artifact2.storage assert artifact2.path.exists() assert artifact2.description == "something" # now put another file in the test directory # create a first file test_filepath_added = test_dirpath / "my_file_added.txt" test_filepath_added.write_text("2") artifact3 = ln.Artifact(test_dirpath, key=key, revises=artifact1) assert artifact3.n_files == 4 assert artifact3.hash != hash_test_dir assert artifact3._state.adding assert artifact3.description is None assert artifact3.path.exists() artifact3.save() # the state of artifact1 is lost, because artifact3 is stored at the same path assert artifact3.overwrite_versions assert artifact1.overwrite_versions assert artifact3.path == artifact1.path test_filepath_added.unlink() # delete the artifact artifact2.delete(permanent=True, storage=False) artifact3.delete(permanent=True, storage=False) def test_create_from_path_overwrite_versions_false(get_test_filepaths): # get variables from fixture is_in_registered_storage = get_test_filepaths[0] test_dirpath = get_test_filepaths[2] hash_test_dir = get_test_filepaths[5] if is_in_registered_storage: return artifact1 = ln.Artifact( test_dirpath, key="my_folder", overwrite_versions=False ).save() assert artifact1.hash == hash_test_dir # skip artifact2 because we already test this above # create a first file test_filepath_added = test_dirpath / "my_file_added.txt" test_filepath_added.write_text("2") artifact3 = ln.Artifact(test_dirpath, key="my_folder", overwrite_versions=False) assert artifact3.hash != hash_test_dir artifact3.save() # the state of artifact1 is lost, because artifact3 is stored at the same path assert not artifact3.overwrite_versions assert not artifact1.overwrite_versions assert artifact3.path != artifact1.path test_filepath_added.unlink() artifact1.delete(permanent=True, storage=False) artifact3.delete(permanent=True, storage=False) def test_delete_permanently_from_trash_folder(tmp_path): folder_path = tmp_path / "folder-overwrite-versions" folder_path.mkdir() (folder_path / "v1.txt").write_text("v1") key = f"{tmp_path.name}/folder-overwrite-versions" artifact = ln.Artifact(folder_path, key=key).save() assert artifact.overwrite_versions # First soft-delete (move to trash), then delete permanently. 
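# branch_id == -1 marks the trash branch; the second delete() prompts for confirmation, hence the patched builtins.input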
artifact.delete() artifact.refresh_from_db() assert artifact.branch_id == -1 with patch("builtins.input", return_value="y"): artifact.delete() assert ln.Artifact.objects.filter(uid__startswith=artifact.stem_uid).count() == 0 def test_create_from_path_set_branch(): branch = ln.Branch(name="contrib1").save() artifact1 = ln.Artifact(".gitignore", key="test", branch=branch).save() # check hash lookup on different branch artifact2 = ln.Artifact(".gitignore", key="test1") assert artifact1 == artifact2 # cleanup artifact1.delete(permanent=True) branch.delete(permanent=True) @pytest.mark.parametrize("key", [None, "my_new_folder"]) def test_from_dir(get_test_filepaths, key): is_in_registered_storage = get_test_filepaths[0] test_dirpath = get_test_filepaths[2] # the directory contains 3 files, two of them are duplicated artifacts = ln.Artifact.from_dir(test_dirpath, key=key) for artifact in artifacts: if key is not None and is_in_registered_storage: assert artifact._real_key is not None else: assert artifact._real_key is None # we only return the duplicated ones hashes = [artifact.hash for artifact in artifacts if artifact.hash is not None] uids = [artifact.uid for artifact in artifacts] assert len(set(hashes)) == len(hashes) ln.UPath(test_dirpath).view_tree() # now save artifacts.save() # now run again, because now we'll have hash-based lookup! artifacts = ln.Artifact.from_dir(test_dirpath, key=key) assert len(artifacts) == 2 assert len(set(artifacts)) == len(hashes) queried_artifacts = ln.Artifact.filter(uid__in=uids) for artifact in queried_artifacts: artifact.delete(permanent=True, storage=False) def test_create_from_dataframe(example_dataframe: pd.DataFrame): df = example_dataframe artifact = ln.Artifact.from_dataframe(df, description="test1") assert artifact.description == "test1" assert artifact.key is None assert artifact.otype == "DataFrame" assert artifact.kind == "dataset" assert artifact.n_observations == 2 assert hasattr(artifact, "_local_filepath") artifact.key = "my-test-dataset" # try changing key with pytest.raises(ln.errors.InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: The suffix '' of the provided key is incorrect, it should be '.parquet'." 
) artifact.key = None # restore artifact.suffix = ".whatever" # changing suffix before first save is invalid with pytest.raises( ln.errors.InvalidArgument, match="Cannot update the suffix of an artifact before it is saved.", ): artifact.save() artifact.suffix = ".parquet" artifact.save() # check that the local filepath has been cleared assert not hasattr(artifact, "_local_filepath") del artifact # now get an artifact from the database artifact = ln.Artifact.get(description="test1") parquet_path = artifact.path assert parquet_path.exists() assert parquet_path.suffix == ".parquet" # test cancelling the move artifact.suffix = ".whatever" with patch("builtins.input", return_value="n"): assert artifact.save() is None assert parquet_path.exists() artifact = ln.Artifact.get(description="test1") assert artifact.suffix == ".parquet" artifact.suffix = ".whatever" with patch("builtins.input", return_value="y"): artifact.save() assert artifact.suffix == ".whatever" whatever_path = artifact.path assert whatever_path.exists() assert whatever_path.suffix == ".whatever" assert not parquet_path.exists() artifact.suffix = ".parquet" with patch("builtins.input", return_value="y"): artifact.save() assert artifact.suffix == ".parquet" parquet_path_restored = artifact.path assert parquet_path_restored.exists() assert parquet_path_restored.suffix == ".parquet" assert not whatever_path.exists() # coming from `key is None` that setting a key with different suffix is not allowed artifact.key = "my-test-dataset.suffix" with pytest.raises(ln.errors.InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: The suffix '.suffix' of the provided key is incorrect, it should be '.parquet'." ) # coming from `key is None` test with no suffix artifact.key = "my-test-dataset" with pytest.raises(ln.errors.InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: The suffix '' of the provided key is incorrect, it should be '.parquet'." ) # virtual key and suffix can now be updated together artifact.key = "my-test-dataset" artifact.suffix = "" with patch("builtins.input", return_value="y"): artifact.save() assert artifact.suffix == "" assert artifact.key == "my-test-dataset" # changing the suffix updates the key suffix as well artifact.suffix = ".parquet" with patch("builtins.input", return_value="y"): artifact.save() assert artifact.key == "my-test-dataset.parquet" # coming from a .parquet key, test changing the key to no suffix artifact.key = "my-test-dataset" with pytest.raises(ln.errors.InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: The suffix '' of the provided key is incorrect, it should be '.parquet'." 
) artifact.delete(permanent=True) # test from_dataframe with a path path = Path("test_df_from_path.parquet") try: example_dataframe.to_parquet(path) for path_input in [path, str(path)]: artifact = ln.Artifact.from_dataframe( path_input, description="test from path" ) assert artifact.description == "test from path" assert artifact.otype == "DataFrame" assert artifact.kind == "dataset" assert artifact.n_observations == 2 artifact.save() artifact.delete(permanent=True) finally: path.unlink(missing_ok=True) def test_dataframe_validate_suffix(example_dataframe: pd.DataFrame): df = example_dataframe artifact = ln.Artifact.from_dataframe(df, key="test_.parquet") assert artifact.suffix == ".parquet" with pytest.raises(ln.errors.InvalidArgument) as error: artifact = ln.Artifact.from_dataframe(df, key="test_.def") assert ( error.exconly().partition(",")[0] == "lamindb.errors.InvalidArgument: The passed key's suffix '.def' must match the passed path's suffix '.parquet'." ) def test_create_from_parquet_file_default_constructor( example_dataframe: pd.DataFrame, ccaplog: pytest.LogCaptureFixture ): path = "test_df.parquet" example_dataframe.to_parquet(path) ln.Artifact(path, key=path) assert "data is a DataFrame, please use .from_dataframe()" in ccaplog.text Path(path).unlink() def test_create_from_anndata(get_small_adata, adata_file, example_dataframe): with pytest.raises(ValueError) as error: ln.Artifact.from_anndata(example_dataframe, description="test1") assert ( "data has to be an AnnData object or a path to AnnData-like" in error.exconly() ) for i, _a in enumerate([get_small_adata, adata_file]): artifact = ln.Artifact.from_anndata(_a, description="test1") assert artifact.description == "test1" assert artifact.key is None assert artifact.otype == "AnnData" assert artifact.kind == "dataset" assert artifact.n_observations == 2 if i == 0: assert hasattr(artifact, "_local_filepath") artifact.save() # check that the local filepath has been cleared assert not hasattr(artifact, "_local_filepath") artifact.delete(permanent=True) def test_from_anndata_uses_h5ad_kwargs(get_small_adata): artifact = ln.Artifact.from_anndata( get_small_adata, key="test_kwargs.h5ad", h5ad_kwargs={"compression": "gzip"}, ) local_path = artifact._local_filepath with h5py.File(local_path, mode="r") as store: assert store["X"].compression == "gzip" local_path.unlink(missing_ok=True) def test_from_anndata_uses_zarr_kwargs(get_small_adata): chunks = (1, get_small_adata.n_vars) artifact = ln.Artifact.from_anndata( get_small_adata, key="test_kwargs.zarr", format="zarr", zarr_kwargs={"chunks": chunks}, ) local_path = artifact._local_filepath assert zarr.open(local_path, mode="r")["X"].chunks == chunks shutil.rmtree(local_path) def test_from_anndata_validate_suffix(get_small_adata): artifact = ln.Artifact.from_anndata(get_small_adata, key="test_.h5ad") assert artifact.suffix == ".h5ad" artifact = ln.Artifact.from_anndata( get_small_adata, format="h5ad", key="test_.h5ad" ) assert artifact.suffix == ".h5ad" artifact = ln.Artifact.from_anndata(get_small_adata, key="test_.zarr") assert artifact.suffix == ".zarr" with pytest.raises(ValueError) as error: artifact = ln.Artifact.from_anndata(get_small_adata, key="test_.def") assert ( error.exconly().partition(",")[0] == "ValueError: Error when specifying AnnData storage format" ) with pytest.raises(InvalidArgument) as error: artifact = ln.Artifact.from_anndata(get_small_adata, key="test_") assert ( error.exconly().partition(",")[0] == "lamindb.errors.InvalidArgument: The passed key's suffix '' 
must match the passed path's suffix '.h5ad'." ) def test_create_from_mudata(get_small_mdata, mudata_file, adata_file): with pytest.raises(ValueError) as error: ln.Artifact.from_mudata(adata_file, description="test1") assert "data has to be a MuData object or a path to MuData-like" in error.exconly() for m in [get_small_mdata, mudata_file]: af = ln.Artifact.from_mudata(m, description="test1") assert af.description == "test1" assert af.key is None assert af.otype == "MuData" assert af.kind == "dataset" if isinstance(m, md.MuData): assert af.n_observations == 2 def test_create_from_spatialdata( get_small_sdata, spatialdata_file, adata_file, ccaplog ): with pytest.raises(ValueError) as error: ln.Artifact.from_spatialdata(adata_file, description="test1") assert ( "data has to be a SpatialData object or a path to SpatialData-like" in error.exconly() ) for s in [get_small_sdata, spatialdata_file]: af = ln.Artifact(s, description="test1") assert af.description == "test1" assert af.key is None assert af.otype == "SpatialData" assert af.kind is None # n_observations not defined assert "data is a SpatialData, please use .from_spatialdata()" in ccaplog.text for s in [get_small_sdata, spatialdata_file]: af = ln.Artifact.from_spatialdata(s, description="test1") assert af.description == "test1" assert af.key is None assert af.otype == "SpatialData" assert af.kind == "dataset" # n_observations not defined @pytest.mark.parametrize( "data", ["get_small_adata"], indirect=True, ) def test_create_from_anndata_in_storage(data): artifact = ln.Artifact.from_anndata( data, description="test_create_from_anndata_memory" ) assert artifact.n_observations == data.n_obs assert artifact.otype == "AnnData" assert hasattr(artifact, "_local_filepath") artifact.save() # check that the local filepath has been cleared assert not hasattr(artifact, "_local_filepath") # ------------------------------------------------------------------------------------- # Life cycle management # ------------------------------------------------------------------------------------- def test_revise_recreate_artifact(example_dataframe: pd.DataFrame, ccaplog): df = example_dataframe # attempt to create a file with an invalid version with pytest.raises(ValueError) as error: artifact = ln.Artifact.from_dataframe(df, description="test", version=0) assert ( error.exconly() == "ValueError: `version` parameter must be `None` or `str`, e.g., '0.1', '1'," " '2', etc." 
) # create a file and tag it with a version key = "my-test-dataset.parquet" artifact = ln.Artifact.from_dataframe(df, key=key, description="test", version="1") assert artifact.version_tag == "1" assert artifact.version == "1" assert artifact.uid.endswith("0000") assert artifact.path.exists() # because of cache file already exists artifact.save() assert artifact.path.exists() assert artifact.suffix == ".parquet" with pytest.raises(ValueError) as error: artifact_v2 = ln.Artifact.from_dataframe(df, revises=artifact, version="1") assert ( error.exconly() == "ValueError: Please change the version tag or leave it `None`, '1' is already taken" ) # create new file from old file df.iloc[0, 0] = 99 # mutate dataframe so that hash lookup doesn't trigger artifact_v2 = ln.Artifact.from_dataframe(df, revises=artifact) assert artifact_v2.stem_uid == artifact.stem_uid assert artifact_v2.uid.endswith("0001") # call this again artifact_v2 = ln.Artifact.from_dataframe(df, revises=artifact) assert artifact_v2.uid.endswith("0001") assert artifact_v2.stem_uid == artifact.stem_uid assert artifact_v2.version_tag is None assert ( artifact_v2.version == artifact_v2.uid[-4:] ) # version falls back to uid suffix assert artifact_v2.key == key assert artifact.suffix == ".parquet" assert artifact_v2.description == "test" assert artifact_v2._revises is not None artifact_v2.save() assert artifact_v2.path.exists() assert artifact_v2._revises is None # revise by providing `revises` argument (do not save) df.iloc[0, 0] = 0 # mutate dataframe so that hash lookup doesn't trigger artifact_v3 = ln.Artifact.from_dataframe( df, description="test1", revises=artifact_v2, version="2" ) assert artifact_v3.uid.endswith("0002") assert artifact_v3.stem_uid == artifact.stem_uid assert artifact_v3.version_tag == "2" assert artifact_v3.version == "2" assert artifact_v3.description == "test1" assert artifact_v3.key == key # revise by matching on `key` (do not save) artifact_v3 = ln.Artifact.from_dataframe( df, key=key, description="test1", version="2" ) assert artifact_v3.uid.endswith("0002") assert artifact_v3.stem_uid == artifact.stem_uid assert artifact_v3.key == key assert artifact_v3.version_tag == "2" assert artifact_v3.version == "2" assert artifact_v3.description == "test1" assert artifact_v3.is_latest assert artifact_v2.is_latest artifact_v3.save() # now r2 is no longer the latest version, but need to re-fresh from db artifact_v2.refresh_from_db() assert not artifact_v2.is_latest # re-create based on hash when artifact_v3 is in trash artifact_v3.delete() artifact_new = ln.Artifact.from_dataframe( df, key="my-test-dataset1.parquet", ) assert artifact_new != artifact_v3 assert artifact_new.hash == artifact_v3.hash assert artifact_new.key == "my-test-dataset1.parquet" artifact_v3.restore() # restore from trash # re-create based on hash while providing same key, previous version df.iloc[0, 0] = 99 # this is a previous version artifact_new = ln.Artifact.from_dataframe( df, key=key, ) assert artifact_new == artifact_v2 assert artifact_new.hash == artifact_v2.hash assert artifact_new.key == key assert artifact.is_latest is False # re-create based on hash while providing a different key df.iloc[0, 0] = 0 artifact_new = ln.Artifact.from_dataframe( df, key="my-test-dataset1.parquet", description="test1 updated", ) assert artifact_new == artifact_v3 assert artifact_new.hash == artifact_v3.hash assert artifact_new.key == key # old key assert artifact_new.description == "test1 updated" # re-create while skipping hash lookup with different key 
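# skip_hash_lookup=True bypasses the hash-based duplicate lookup, so a new artifact record is created despite the identical hash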
artifact_v4 = ln.Artifact.from_dataframe( df, key="my-test-dataset1.parquet", skip_hash_lookup=True, ) assert artifact_v4.uid != artifact_v3.uid assert artifact_v4.hash == artifact_v3.hash assert artifact_v4.key == "my-test-dataset1.parquet" artifact_v4.save() # this just saves a duplicated file # re-create while skipping hash lookup with same key artifact_new = ln.Artifact.from_dataframe( df, key="my-test-dataset1.parquet", skip_hash_lookup=True, ) assert artifact_new.uid != artifact_v4.uid assert artifact_new.stem_uid == artifact_v4.stem_uid assert artifact_new.hash == artifact_v4.hash artifact_new.save() # should now violate unique constraint, falls back artifact_v4 assert artifact_new.uid == artifact_v4.uid # re-create while skipping hash lookup artifact, move to trash before artifact_v4.delete() artifact_new = ln.Artifact.from_dataframe( df, key="my-test-dataset1.parquet", skip_hash_lookup=True, ) assert artifact_new.uid != artifact_v4.uid assert artifact_new.key == "my-test-dataset1.parquet" assert "returning artifact from trash" not in ccaplog.text artifact_new.save() # should now violate unique constraint, retrieve artifact_v4 from trash assert "returning artifact from trash" in ccaplog.text assert artifact_new.uid == artifact_v4.uid assert artifact_new.branch_id == 1 # restored to default branch with pytest.raises(TypeError) as error: ln.Artifact.from_dataframe( df, description="test1a", revises=ln.Record(name="test") ) assert error.exconly() == "TypeError: `revises` has to be of type `Artifact`" artifact_v3.delete(permanent=True) artifact_v2.delete(permanent=True) artifact.delete(permanent=True) # unversioned file artifact = ln.Artifact.from_dataframe(df, description="test2") assert artifact.version_tag is None assert artifact.version == artifact.uid[-4:] # version falls back to uid suffix # what happens if we don't save the old file? # add a test for it! 
artifact.save() # create new file from old file df.iloc[0, 0] = 101 # mutate dataframe so that hash lookup doesn't trigger new_artifact = ln.Artifact.from_dataframe(df, revises=artifact) assert artifact.version_tag is None assert artifact.version == artifact.uid[-4:] # version falls back to uid suffix assert new_artifact.stem_uid == artifact.stem_uid assert new_artifact.version_tag is None assert ( new_artifact.version == new_artifact.uid[-4:] ) # version falls back to uid suffix assert new_artifact.description == artifact.description new_artifact.save() assert new_artifact.is_latest assert "you are saving to a non-latest version of the artifact" not in ccaplog.text old_artifact = ln.Artifact.get(artifact.id) # to update is_latest from the db assert not old_artifact.is_latest old_artifact.description = "change old version description" old_artifact.save() assert "you are saving to a non-latest version of the artifact" in ccaplog.text old_artifact.delete() new_artifact.delete() artifact_from_trash = ln.Artifact.get(new_artifact.uid[:-4]) # query with stem uid assert artifact_from_trash.branch_id == -1 old_artifact.delete(permanent=True) new_artifact.delete(permanent=True) # check after cleanups assert ( ccaplog.text.count("you are saving to a non-latest version of the artifact") == 1 ) def test_delete_and_restore_artifact(example_dataframe: pd.DataFrame): df = example_dataframe artifact = ln.Artifact.from_dataframe( df, description="My test file to delete" ).save() assert artifact.branch_id == 1 assert artifact.key is None or artifact._key_is_virtual storage_path = artifact.path # trash behavior artifact.delete() assert storage_path.exists() assert artifact.branch_id == -1 assert ln.Artifact.filter(description="My test file to delete").first() is None assert ln.Artifact.filter( description="My test file to delete", branch__name="trash" ).first() # no implicit restore from trash, we're making a new artifact artifact_restored = ln.Artifact.from_dataframe( df, description="My test file to delete" ) assert artifact_restored.branch_id == 1 assert artifact_restored != artifact # permanent delete artifact.delete(permanent=True) assert ( ln.Artifact.filter(description="My test file to delete", branch_id=None).first() is None ) assert not storage_path.exists() # deletes from storage is key_is_virtual def test_delete_storage(): with pytest.raises(FileNotFoundError): delete_storage(ln.settings.storage.root / "test-delete-storage") def test_recreate_after_artifact_moved_in_storage(ccaplog): # this needs to be in a registered storage location Path("./default_storage_unit_core/test_file.txt").write_text("test content") artifact = ln.Artifact("./default_storage_unit_core/test_file.txt").save() # now rename the file within the storage location Path("./default_storage_unit_core/test_file.txt").rename( "./default_storage_unit_core/moved_file.txt" ) ln.Artifact("./default_storage_unit_core/moved_file.txt").save() assert "updating previous key" in ccaplog.text artifact.delete(permanent=True, storage=True) # ------------------------------------------------------------------------------------- # Storage # ------------------------------------------------------------------------------------- def test_move_artifact_exception_handling(): import lamindb.models.artifact as artifact_module class FakeFS: def __init__( self, copy_error: Exception | None = None, exists: bool = False, rm_error: Exception | None = None, ): self.copy_error = copy_error self._exists = exists self.rm_error = rm_error self.rm_calls = 0 def 
exists(self, path: str) -> bool: return self._exists def copy(self, source: str, target: str, recursive: bool = True): if self.copy_error is not None: raise self.copy_error def rm(self, path: str, recursive: bool = True): self.rm_calls += 1 if self.rm_error is not None: raise self.rm_error source_path = UPath("s3://lamindb-ci/source-artifact") storage = SimpleNamespace(path=UPath("s3://lamindb-ci"), id=42) # _rm_catch_error helper branches fs_missing = FakeFS(exists=False) assert ( artifact_module._rm_catch_error(fs_missing, "s3://lamindb-ci/missing") is None ) assert fs_missing.rm_calls == 0 fs_ok = FakeFS(exists=True) assert artifact_module._rm_catch_error(fs_ok, "s3://lamindb-ci/target") is None assert fs_ok.rm_calls == 1 rm_error = RuntimeError("rm failed") fs_fail = FakeFS(exists=True, rm_error=rm_error) returned_error = artifact_module._rm_catch_error(fs_fail, "s3://lamindb-ci/target") assert returned_error is rm_error assert fs_fail.rm_calls == 1 # copy branch: copy fails and cleanup helper is included in the message artifact_copy = SimpleNamespace(path=source_path, storage_id=None) with ( patch.object( artifact_module, "_s", return_value=SimpleNamespace( auto_storage_key_from_artifact=lambda _: "target-artifact" ), ), patch.object( artifact_module, "fs_for_moving", return_value=FakeFS(copy_error=ValueError("copy failed")), ), patch.object( artifact_module, "_rm_catch_error", return_value=RuntimeError("rm failed"), ) as rm_mock, ): with pytest.raises(RuntimeError, match="Failed to copy artifact"): artifact_module._move_artifact_to_storage(artifact_copy, storage) assert rm_mock.call_count == 1 # target exists branch: raises before attempting copy artifact_exists = SimpleNamespace(path=source_path, storage_id=None) with ( patch.object( artifact_module, "_s", return_value=SimpleNamespace( auto_storage_key_from_artifact=lambda _: "target-artifact" ), ), patch.object( artifact_module, "fs_for_moving", return_value=FakeFS(exists=True) ), ): with pytest.raises(FileExistsError, match="already exists"): artifact_module._move_artifact_to_storage(artifact_exists, storage) # same source and target path is rejected early artifact_same_path = SimpleNamespace(path=source_path, storage_id=None) with patch.object( artifact_module, "_s", return_value=SimpleNamespace( auto_storage_key_from_artifact=lambda _: "source-artifact" ), ): with pytest.raises(ValueError, match="Cannot move to the same path"): artifact_module._move_artifact_to_storage(artifact_same_path, storage) # verification branch: sorted sizes mismatch triggers cleanup helper artifact_mismatch = SimpleNamespace(path=source_path, storage_id=None) with ( patch.object( artifact_module, "_s", return_value=SimpleNamespace( auto_storage_key_from_artifact=lambda _: "target-artifact" ), ), patch.object(artifact_module, "fs_for_moving", return_value=FakeFS()), patch.object(artifact_module, "_sorted_sizes", side_effect=[[1], [2]]), patch.object( artifact_module, "_rm_catch_error", return_value=RuntimeError("rm failed"), ) as rm_mock, ): with pytest.raises(RuntimeError, match="Move verification failed"): artifact_module._move_artifact_to_storage(artifact_mismatch, storage) assert rm_mock.call_count == 1 # source-removal branch: move succeeds but rm(source) fails and is logged artifact_rm_fail = SimpleNamespace(path=source_path, storage_id=None) with ( patch.object( artifact_module, "_s", return_value=SimpleNamespace( auto_storage_key_from_artifact=lambda _: "target-artifact" ), ), patch.object( artifact_module, "fs_for_moving", 
return_value=FakeFS(rm_error=RuntimeError()), ), patch.object(artifact_module, "_sorted_sizes", side_effect=[[1], [1]]), patch.object(artifact_module.logger, "error") as logger_error_mock, ): artifact_module._move_artifact_to_storage(artifact_rm_fail, storage) assert artifact_rm_fail.storage_id == storage.id assert logger_error_mock.call_count == 1 @pytest.mark.parametrize("suffix", [".txt", "", None]) def test_auto_storage_key_from_artifact_uid(suffix): test_id = "abo389f" if suffix is None: with pytest.raises(AssertionError): auto_storage_key_from_artifact_uid(test_id, suffix, False) else: assert AUTO_KEY_PREFIX == ".lamindb/" storage_key = auto_storage_key_from_artifact_uid(test_id, suffix, False) assert storage_key == f"{AUTO_KEY_PREFIX}{test_id}{suffix}" def test_storage_root_upath_equivalence(): storage_root = UPath("s3://lamindb-ci") filepath = UPath("s3://lamindb-ci/test-data/Species.csv") assert filepath.parents[-1] == storage_root def test_get_relative_path_to_directory(): # upath on S3 upath_root = UPath("s3://lamindb-ci") upath_directory1 = UPath("s3://lamindb-ci/test-data") # no trailing slash upath_directory2 = UPath("s3://lamindb-ci/test-data/") # trailing slash upath_file = UPath("s3://lamindb-ci/test-data/test.csv") assert ( "test-data/test.csv" == get_relative_path_to_directory(upath_file, upath_root).as_posix() ) assert ( "test.csv" == get_relative_path_to_directory(upath_file, upath_directory1).as_posix() ) assert ( "test.csv" == get_relative_path_to_directory(upath_file, upath_directory2).as_posix() ) # local path root = Path("/lamindb-ci") upath = Path("/lamindb-ci/test-data/test.csv") assert ( "test-data/test.csv" == get_relative_path_to_directory(upath, directory=root).as_posix() ) local_upath_root = UPath(root.as_posix()) local_upath_file = UPath(upath.as_posix()) assert ( "test-data/test.csv" == get_relative_path_to_directory( local_upath_file, directory=local_upath_root ).as_posix() ) with pytest.raises(TypeError) as error: get_relative_path_to_directory(upath, directory=".") assert error.exconly() == "TypeError: Directory not of type Path or UPath" def test_check_path_is_child_of_root(): # str root = "s3://lamindb-ci" upath = "s3://lamindb-ci/test-data/test.csv" assert check_path_is_child_of_root(upath, root=root) # str different protocols root = "prot1://lamindb-ci" upath = "prot2://lamindb-ci/test-data/test.csv" assert not check_path_is_child_of_root(upath, root=root) # UPath root = UPath("s3://lamindb-ci") upath = UPath("s3://lamindb-ci/test-data/test.csv") assert check_path_is_child_of_root(upath, root=root) upath2 = UPath("s3://lamindb-setup/test-data/test.csv") assert not check_path_is_child_of_root(upath2, root=root) # local path root = Path("/lamindb-ci") path = Path("/lamindb-ci/test-data/test.csv") assert check_path_is_child_of_root(path, root=root) path = Path("/lamindb-other/test-data/test.csv") assert not check_path_is_child_of_root(path, root=root) # Local & UPath root = UPath("s3://lamindb-ci") path = Path("/lamindb-ci/test-data/test.csv") assert not check_path_is_child_of_root(path, root=root) # different storage_options upath = UPath("s3://lamindb-ci/test-data/test.csv", cache_regions=True) assert upath.storage_options != root.storage_options assert check_path_is_child_of_root(upath, root=root) # the second level root = UPath("s3://lamindb-ci/test-data/") upath = UPath("s3://lamindb-ci/test-data/test/test.csv") assert check_path_is_child_of_root(upath, root=root) upath2 = UPath("s3://lamindb-ci/test-data-1/test/test.csv") assert not 
check_path_is_child_of_root(upath2, root=root) # http assert check_path_is_child_of_root( "https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/README.md", root="https://raw.githubusercontent.com", ) # s3 with endpoint assert not check_path_is_child_of_root( "s3://bucket/key?endpoint_url=http://localhost:8000", root="s3://bucket/", ) assert not check_path_is_child_of_root( "s3://bucket/key/", root="s3://bucket/?endpoint_url=http://localhost:8000", ) assert check_path_is_child_of_root( "s3://bucket/key?endpoint_url=http://localhost:8000", root="s3://bucket?endpoint_url=http://localhost:8000", ) assert check_path_is_child_of_root( UPath("s3://bucket/key", endpoint_url="http://localhost:8000"), root="s3://bucket?endpoint_url=http://localhost:8000", ) def test_serialize_paths(): fp_str = ln.examples.datasets.anndata_file_pbmc68k_test().as_posix() fp_path = Path(fp_str) up_str = "s3://lamindb-ci/test-unknown-storage-in-core-tests/test.csv" up_upath = UPath(up_str) storage = ln.settings.storage.record using_key = None _, filepath, _, _, _ = process_data( "id", fp_str, None, None, storage, using_key, skip_existence_check=True ) assert isinstance(filepath, LocalPathClasses) _, filepath, _, _, _ = process_data( "id", fp_path, None, None, storage, using_key, skip_existence_check=True ) assert isinstance(filepath, LocalPathClasses) with pytest.raises(ln.errors.UnknownStorageLocation) as err: _, filepath, _, _, _ = process_data( "id", up_str, None, None, storage, using_key, skip_existence_check=True, ) assert f"Path {up_str} is not contained in any known storage" in err.exconly() storage = ln.Storage( root="s3://lamindb-ci/test-unknown-storage-in-core-tests" ).save() _, filepath, _, _, _ = process_data( "id", up_str, None, None, storage, using_key, skip_existence_check=True ) assert isinstance(filepath, CloudPath) _, filepath, _, _, _ = process_data( "id", up_upath, None, None, storage, using_key, skip_existence_check=True, ) assert isinstance(filepath, CloudPath) storage.delete() Path("pbmc68k_test.h5ad").unlink(missing_ok=True) # ------------------------------------------------------------------------------------- # Data structures in storage # ------------------------------------------------------------------------------------- def test_data_is_anndata_paths(): assert data_is_scversedatastructure("something.h5ad", "AnnData") assert data_is_scversedatastructure("something.anndata.zarr", "AnnData") assert data_is_scversedatastructure( "s3://somewhere/something.anndata.zarr", "AnnData" ) assert not data_is_scversedatastructure("s3://somewhere/something.zarr", "AnnData") def test_data_is_anndata_anndatacessor(get_small_adata): artifact = ln.Artifact(get_small_adata, key="test_adata.h5ad").save() with artifact.open(mode="r") as access: assert data_is_scversedatastructure(access, "AnnData") artifact.delete(permanent=True) def test_data_is_mudata_paths(): assert data_is_scversedatastructure("something.h5mu", "MuData") assert data_is_scversedatastructure("something.mudata.zarr", "MuData") def test_data_is_spatialdata_paths(): assert data_is_scversedatastructure("something.spatialdata.zarr", "SpatialData") @pytest.mark.parametrize( "data,data_type,expected", [ ("get_small_adata", "AnnData", True), ("get_small_mdata", "MuData", True), ("get_small_sdata", "SpatialData", True), ("get_small_adata", "MuData", False), ("get_small_mdata", "AnnData", False), ("get_small_sdata", "AnnData", False), ("get_small_adata", None, True), (pd.DataFrame(), "AnnData", False), (None, "AnnData", False), (None, 
None, False), ], ) def test_data_is_scversedatastructure(request, data, data_type, expected): if isinstance(data, str) and data.startswith("get_small_"): data = request.getfixturevalue(data) assert data_is_scversedatastructure(data, data_type) == expected # ------------------------------------------------------------------------------------- # Miscellaneous # ------------------------------------------------------------------------------------- def test_load_to_memory(tsv_file, zip_file, fcs_file, yaml_file): # tsv df = load_tsv(tsv_file) assert isinstance(df, pd.DataFrame) # fcs adata = load_fcs(str(fcs_file)) assert isinstance(adata, ad.AnnData) # error with pytest.raises(NotImplementedError): load_to_memory(zip_file) # check that it is a path assert isinstance(load_to_memory("./somefile.rds"), UPath) # yaml dct = load_to_memory(yaml_file) assert dct["a"] == 1 assert dct["b"] == 2 with pytest.raises(TypeError) as error: ln.Artifact(True) assert error.exconly() == "TypeError: data has to be a string, Path, UPath" def test_bulk_delete(): report_path = Path("report.html") report_path.write_text("a") environment_path = Path("environment.txt") environment_path.write_text("c") report = ln.Artifact(report_path, description="Report").save() report_path.unlink() report_path = report.path environment = ln.Artifact(environment_path, description="requirement.txt").save() environment_path.unlink() environment_path = environment.path ln.Artifact.filter(id__in=[environment.id, report.id]).delete() assert len(ln.Artifact.filter(id__in=[environment.id, report.id], branch_id=1)) == 0 # the 2 artifacts are in trash now assert ( len( ln.Artifact.filter( id__in=[environment.id, report.id], branch_id=-1, ) ) == 2 ) ln.Artifact.filter(id__in=[environment.id, report.id], branch_id=-1).delete( permanent=True ) # now they're gone assert ( len( ln.Artifact.filter( id__in=[environment.id, report.id], branch_id=None, ) ) == 0 ) assert not report_path.exists() assert not environment_path.exists() @pytest.mark.parametrize("module_name", ["mudata", "spatialdata"]) def test_no_unnecessary_imports( example_dataframe: pd.DataFrame, module_name: str ) -> None: if module_name in sys.modules: del sys.modules[module_name] af = ln.Artifact.from_dataframe(example_dataframe, description="to delete").save() loaded_packages = [] for name, module in sys.modules.items(): if isinstance(module, ModuleType) and not name.startswith("_"): if "." 
not in name: loaded_packages.append(name) assert module_name not in sorted(loaded_packages) # Cleanup and restore imports to ensure that other tests still run smoothly af.delete(permanent=True) import mudata # noqa import spatialdata # noqa def test_artifact_get_tracking(example_dataframe: pd.DataFrame): artifact = ln.Artifact.from_dataframe(example_dataframe, key="df.parquet").save() transform = ln.Transform(key="test track artifact via get").save() run = ln.Run(transform).save() assert ( ln.Artifact.get(key="df.parquet", is_run_input=run) in run.input_artifacts.all() ) artifact.delete(permanent=True) transform.delete(permanent=True) def test_get_by_path(example_dataframe: pd.DataFrame): artifact = ln.Artifact.from_dataframe(example_dataframe, key="df.parquet").save() artifact_path = artifact.path assert ln.Artifact.get(path=artifact_path) == artifact assert ln.Artifact.filter().get(path=artifact_path.as_posix()) == artifact with pytest.raises(ln.errors.ObjectDoesNotExist): ln.Artifact.get(path="s3://bucket/folder/file.parquet") with pytest.raises(ValueError): ln.User.get(path="some/path") artifact.delete(permanent=True) path_str = "s3://lamindb-ci/test-data/test.csv" storage = ln.Storage(ln.UPath(path_str).parent).save() artifact = ln.Artifact(path_str, description="test get by path").save() assert not artifact._key_is_virtual assert artifact._real_key is None assert ln.Artifact.get(path=path_str) == artifact artifact.delete(permanent=True, storage=False) artifact = ln.Artifact(path_str, key="some_file.csv").save() assert artifact._key_is_virtual assert artifact._real_key.endswith("test.csv") assert ln.Artifact.get(path=path_str) == artifact artifact.delete(permanent=True, storage=False) storage.delete() def test_update_suffix_for_registered_storage_with_real_key( registered_storage_file_and_folder, ): test_filepath, folder_path = registered_storage_file_and_folder assert folder_path.exists() and folder_path.is_dir() artifact = ln.Artifact(test_filepath, key="my_file.csv").save() assert artifact._real_key is not None assert artifact.path.suffix == ".csv" source_path = artifact.path artifact.suffix = ".tsv" with patch("builtins.input", return_value="y"): artifact.save() target_path = artifact.path assert artifact.suffix == ".tsv" assert artifact.key is not None assert artifact.key.endswith(".tsv") assert artifact._real_key is not None assert artifact._real_key.endswith(".tsv") assert target_path.suffix == ".tsv" assert target_path.exists() assert not source_path.exists() artifact.delete(permanent=True, storage=False) def test_update_suffix_for_registered_storage_folder_artifact( registered_storage_file_and_folder, ): _, folder_path = registered_storage_file_and_folder artifact = ln.Artifact(folder_path, key="dataset").save() assert artifact._real_key is not None assert artifact.suffix == "" assert artifact.path.exists() assert artifact.path.is_dir() source_path = artifact.path artifact.suffix = ".zarr" with patch("builtins.input", return_value="y"): artifact.save() target_path = artifact.path assert artifact.suffix == ".zarr" assert artifact.key is not None assert artifact.key.endswith(".zarr") assert artifact._real_key is not None assert artifact._real_key.endswith(".zarr") assert target_path.exists() assert target_path.is_dir() assert target_path.suffix == ".zarr" assert not source_path.exists() artifact.delete(permanent=True, storage=False) def test_update_non_virtual_key_for_registered_storage_file( registered_storage_file_and_folder, ): test_filepath, _ = 
registered_storage_file_and_folder artifact = ln.Artifact(test_filepath).save() assert not artifact._key_is_virtual assert artifact._real_key is None assert artifact.key is not None source_path = artifact.path source_key = artifact.key target_key = ( PurePosixPath(source_key) .with_name("suffix_fixture_file_renamed.csv") .as_posix() ) artifact.key = target_key with patch("builtins.input", return_value="n"): assert artifact.save() is None assert source_path.exists() artifact = ln.Artifact.get(uid=artifact.uid) assert artifact.key == source_key artifact.key = target_key with patch("builtins.input", return_value="y"): artifact.save() target_path = artifact.path assert artifact.key == target_key assert target_path.exists() assert not source_path.exists() artifact.delete(permanent=True, storage=False) def test_update_non_virtual_key_for_registered_storage_file_invalid_suffix( registered_storage_file_and_folder, ): test_filepath, _ = registered_storage_file_and_folder artifact = ln.Artifact(test_filepath).save() assert artifact.key is not None artifact.key = PurePosixPath(artifact.key).with_suffix(".tsv").as_posix() with pytest.raises(InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: The suffix '.tsv' of the provided key is incorrect, it should be '.csv'." ) artifact.delete(permanent=True, storage=False) def test_update_key_to_none_raises_invalid_argument( registered_storage_file_and_folder, ): test_filepath, _ = registered_storage_file_and_folder artifact = ln.Artifact(test_filepath).save() artifact.key = None with pytest.raises(InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: Cannot update an artifact key to None." ) artifact.delete(permanent=True, storage=False) def test_update_non_virtual_key_before_save_raises_invalid_argument(tsv_file): artifact = ln.Artifact(tsv_file, key="before-save.tsv", _key_is_virtual=False) artifact.key = "after-edit.tsv" with pytest.raises(InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: Cannot update the key of an artifact before it is saved." ) def test_update_non_virtual_key_in_unmanaged_storage_raises_invalid_argument(): url = ( "https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/README.md" ) artifact = ln.Artifact(url, description="test unmanaged key update").save() assert not artifact._key_is_virtual artifact.key = "laminlabs/lamindb/refs/heads/main/README-renamed.md" with pytest.raises(InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: Cannot update a non-virtual key of an artifact in a storage location that is not managed by the current instance." ) artifact.delete(permanent=True, storage=False) def test_create_artifact_in_foreign_managed_storage_raises_value_error(tsv_file): storage = ln.settings.storage.record with ( patch.object(storage, "instance_uid", "_not_exists_"), pytest.raises( ValueError, match=( "Cannot create an artifact in a storage location that is not managed by the current instance." 
), ), ): ln.Artifact(tsv_file, storage=storage) def test_save_url_with_virtual_key_and_unmanaged_suffix_update_error(): url = ( "https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/README.md" ) key = "folder/file.md" artifact = ln.Artifact(url, key=key).save() assert artifact._real_key == "laminlabs/lamindb/refs/heads/main/README.md" assert artifact.storage.instance_uid is None cache_path_str = artifact._cache_path.as_posix() assert not cache_path_str.startswith("http") assert cache_path_str.endswith(key) artifact.suffix = ".txt" with pytest.raises( InvalidArgument, match=( "Cannot update the suffix of an artifact in a storage location " "that is not managed by the current instance." ), ): artifact.save() artifact.delete(permanent=True, storage=False) def test_change_space_for_artifact_in_foreign_managed_storage_raises_value_error( tsv_file, ): artifact = ln.Artifact(tsv_file, key="space-change-foreign-storage.tsv").save() space = ln.Space( name="test space change in foreign storage", uid="foreignspace" ).save() artifact.space = space with ( patch.object(artifact.storage, "instance_uid", "_not_exists_"), pytest.raises( ValueError, match=( "Cannot change the space of an artifact in a storage location that is not managed by the current instance." ), ), ): artifact.save() artifact.delete(permanent=True) space.delete(permanent=True) def test_save_artifact_to_foreign_managed_storage_raises_value_error(tsv_file): artifact = ln.Artifact(tsv_file, key="save-foreign-storage.tsv") with ( patch.object(artifact.storage, "instance_uid", "_not_exists_"), pytest.raises( ValueError, match=( "Cannot save an artifact to a storage location that is not managed by the current instance." ), ), ): artifact.save() def test_artifact_space_change(tsv_file): artifact = ln.Artifact(tsv_file, key="test_space_change.tsv").save() space = ln.Space(name="test space change", uid="00000234").save() # test after saving artifact.space = space with pytest.raises(ValueError) as err: artifact.save() assert ( "No local storage locations managed by the current instance found for the space" in err.exconly() ) # test after getting from the db artifact = ln.Artifact.get(key="test_space_change.tsv") artifact.space = space with pytest.raises(ValueError) as err: artifact.save() assert ( "No local storage locations managed by the current instance found for the space" in err.exconly() ) artifact.delete(permanent=True) space.delete(permanent=True) def test_passing_foreign_keys_ids(tsv_file): transform = ln.Transform(key="test passings foreign keys ids").save() first_run = ln.Run(transform).save() second_run = ln.Run(transform).save() # check that passing a wrong type errors with pytest.raises(AssertionError): ln.Artifact(tsv_file, space=transform) with pytest.raises(ValueError) as err: ln.Artifact(tsv_file, run=first_run, run_id=first_run.id) assert "Do not pass both Run and its id at the same time." 


def test_passing_foreign_keys_ids(tsv_file):
    transform = ln.Transform(key="test passings foreign keys ids").save()
    first_run = ln.Run(transform).save()
    second_run = ln.Run(transform).save()
    # check that passing a wrong type errors
    with pytest.raises(AssertionError):
        ln.Artifact(tsv_file, space=transform)
    with pytest.raises(ValueError) as err:
        ln.Artifact(tsv_file, run=first_run, run_id=first_run.id)
    assert "Do not pass both Run and its id at the same time." in err.exconly()
    artifact = ln.Artifact(tsv_file, run=first_run, key="test_fk.tsv").save()
    artifact_id = artifact.id
    assert artifact.run == first_run
    artifact = ln.Artifact(tsv_file, run_id=second_run.id)  # same hash
    assert artifact.id == artifact_id
    assert artifact._subsequent_run_id == second_run.id
    assert second_run in artifact.recreating_runs.all()
    # Run-side: output_artifacts vs recreated_artifacts
    assert list(first_run.output_artifacts.all()) == [artifact]
    assert list(first_run.recreated_artifacts.all()) == []
    assert list(second_run.output_artifacts.all()) == []
    assert list(second_run.recreated_artifacts.all()) == [artifact]
    # query_output_artifacts
    assert list(first_run.query_output_artifacts(include_recreated=False)) == [artifact]
    assert list(first_run.query_output_artifacts(include_recreated=True)) == [artifact]
    assert list(second_run.query_output_artifacts(include_recreated=False)) == []
    assert list(second_run.query_output_artifacts(include_recreated=True)) == [artifact]
    artifact.delete(permanent=True)
    second_run.delete(permanent=True)
    first_run.delete(permanent=True)
    transform.delete(permanent=True)
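

# test_passing_foreign_keys_ids above fixes the provenance semantics when the same
# content is saved again under a different run: `artifact.run` keeps pointing at the
# original creating run, the later run is linked via `artifact.recreating_runs` /
# `run.recreated_artifacts`, and `run.query_output_artifacts(include_recreated=True)`
# folds both together. A minimal sketch (not a test; the transform key and the `path`
# argument are hypothetical):
def _sketch_recreating_run_provenance(path):
    transform = ln.Transform(key="my-pipeline").save()
    run1 = ln.Run(transform).save()
    run2 = ln.Run(transform).save()
    artifact = ln.Artifact(path, run=run1, key="output.tsv").save()
    ln.Artifact(path, run=run2, key="output.tsv").save()  # same content, later run
    assert artifact.run == run1
    assert run2 in artifact.recreating_runs.all()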


================================================
FILE: tests/core/test_artifact_dataframe_with_curation.py
================================================
# Note: Almost all logic for schema-based validation is handled in the curators test suite
# This here only covers external feature annotation and validation
import lamindb as ln
import pandas as pd
import pytest


@pytest.fixture(scope="module")
def two_internal_features():
    feat1 = ln.Feature(name="feat1", dtype=int).save()
    feat2 = ln.Feature(name="feat2", dtype=int).save()
    yield feat1, feat2
    feat1.delete(permanent=True)
    feat2.delete(permanent=True)


@pytest.fixture(scope="module")
def two_external_features():
    feature_a = ln.Feature(name="feature_a", dtype=str).save()
    feature_b = ln.Feature(name="feature_b", dtype=str).save()
    yield feature_a, feature_b
    feature_a.delete(permanent=True)
    feature_b.delete(permanent=True)


@pytest.mark.parametrize("use_schema", [True, False])
def test_create_artifact_with_external_feature_annotations(
    use_schema: bool,
    two_external_features: tuple[ln.Feature, ln.Feature],
):
    feat1, feat2 = two_external_features
    if use_schema:
        schema = ln.Schema(features=[feat1, feat2]).save()
    else:
        schema = None
    artifact = ln.Artifact(
        ".gitignore",
        key="test_file",
        features={"feature_a": "x", "feature_b": "y"},
        schema=schema,
    ).save()
    assert artifact.features.get_values() == {"feature_a": "x", "feature_b": "y"}
    assert artifact.schema == schema
    # repeat to check idempotency (requires set_values() instead of add_values())
    artifact = ln.Artifact(
        ".gitignore",
        key="test_file",
        features={"feature_a": "x", "feature_b": "y"},
        schema=schema,
    ).save()
    assert artifact.features.get_values() == {"feature_a": "x", "feature_b": "y"}
    assert artifact.schema == schema
    if use_schema:
        with pytest.raises(ValueError) as error:
            artifact.features.remove_values("feature_a", value="x")
        assert (
            "Cannot remove values if artifact has external schema."
            in error.exconly()
        )
    else:
        artifact.features.remove_values("feature_a", value="x")
        assert artifact.features.get_values() == {"feature_b": "y"}
    artifact.delete(permanent=True)
    if use_schema:
        schema.delete(permanent=True)


def test_artifact_from_dataframe_with_schema(example_dataframe: pd.DataFrame):
    df = example_dataframe
    feat1 = ln.Feature(name="feat1", dtype=int).save()
    artifact = ln.Artifact.from_dataframe(
        df, key="test_df.parquet", schema="valid_features"
    ).save()
    # repeat to check idempotency
    artifact = ln.Artifact.from_dataframe(
        df, key="test_df.parquet", schema="valid_features"
    ).save()
    assert artifact.schema == ln.examples.schemas.valid_features()
    assert artifact.features.get_values() == {}
    assert (
        artifact.features.describe(return_str=True)
        == """\
Artifact: test_df.parquet (0000)
└── Dataset features
    └── columns (1)
            feat1    int"""
    )
    inferred_schema_link = artifact.schemas.through.get(artifact_id=artifact.id)
    assert inferred_schema_link.slot == "columns"
    assert inferred_schema_link.schema.members.count() == 1
    assert inferred_schema_link.schema.members.first() == feat1
    inferred_schema = inferred_schema_link.schema
    inferred_schema_link.delete()
    inferred_schema.delete(permanent=True)
    feat1.delete(permanent=True)
    artifact.delete(permanent=True)


def test_artifact_dataframe_with_features(example_dataframe: pd.DataFrame):
    """Test column names encoding when features with the same names are present."""
    artifact = ln.Artifact.from_dataframe(example_dataframe, key="df.parquet").save()
    id_feature = ln.Feature(name="id", dtype=int).save()
    uid_feature = ln.Feature(name="uid", dtype=str).save()
    artifact.features.add_values({"id": 1, "uid": "test-uid"})
    df = ln.Artifact.filter(key="df.parquet").to_dataframe(
        include=["description"], features=True
    )
    assert df.index.name == "__lamindb_artifact_id__"
    assert df.columns.tolist() == [
        "__lamindb_artifact_uid__",
        "key",
        "id",
        "uid",
        "description",
    ]
    assert df.iloc[0]["id"] == 1
    assert df.iloc[0]["uid"] == "test-uid"
    artifact.delete(permanent=True)
    id_feature.delete(permanent=True)
    uid_feature.delete(permanent=True)
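

# The next test exercises the `__external__` slot convention: a DataFrame schema can
# hold, next to its column features, a nested schema of "external" features that are
# not columns of the dataframe but are passed as artifact-level values via
# `features={...}`. A minimal construction sketch (not a test; the feature names are
# hypothetical):
def _sketch_schema_with_external_slot():
    column_a = ln.Feature(name="column_a", dtype=int).save()
    external_x = ln.Feature(name="external_x", dtype=str).save()
    external = ln.Schema(features=[external_x]).save()
    return ln.Schema(
        features=[column_a],
        slots={"__external__": external},
        otype="DataFrame",
    ).save()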


def test_from_dataframe_with_external_schema(
    example_dataframe: pd.DataFrame,
    two_external_features: tuple[ln.Feature, ln.Feature],
    two_internal_features: tuple[ln.Feature, ln.Feature],
):
    df = example_dataframe
    feat1, feat2 = two_internal_features
    featA, featB = two_external_features
    schema_external = ln.Schema(features=[featA, featB]).save()
    # Case 1: wrong internal features for this dataframe
    schema_with_mistake = ln.Schema(
        features=[featA, featB],
        slots={"__external__": schema_external},
        otype="DataFrame",
    ).save()
    with pytest.raises(ln.errors.ValidationError) as error:
        artifact = ln.Artifact.from_dataframe(
            df,
            key="test_df_with_external_features.parquet",
            features={"feature_a": "x", "feature_b": "y"},
            schema=schema_with_mistake,
        ).save()
    assert "COLUMN_NOT_IN_DATAFRAME" in error.exconly()
    # alternative via DataFrameCurator directly
    with pytest.raises(ln.errors.ValidationError) as error:
        ln.curators.DataFrameCurator(
            df,
            schema=schema_with_mistake,
        ).validate()
    assert "COLUMN_NOT_IN_DATAFRAME" in error.exconly()
    # Case 2: no schema for external features provided
    schema_no_external = ln.Schema(features=[feat1, feat2]).save()
    artifact = ln.Artifact.from_dataframe(
        df,
        key="test_df_with_external_features.parquet",
        features={"feature_a": "x", "feature_b": "y"},
        schema=schema_no_external,
    ).save()
    assert artifact.features.get_values() == {"feature_a": "x", "feature_b": "y"}
    artifact.delete(permanent=True)
    # alternative via DataFrameCurator directly
    curator = ln.curators.DataFrameCurator(
        df,
        schema=schema_no_external,
        features={"feature_a": "x", "feature_b": "y"},
    )
    artifact = curator.save_artifact(
        key="test_df_with_external_features.parquet",
    ).save()
    assert artifact.features.get_values() == {"feature_a": "x", "feature_b": "y"}
    artifact.delete(permanent=True)
    # Case 3: correct external schema
    schema_correct_external = ln.Schema(
        features=[feat1, feat2],
        slots={"__external__": schema_external},
        otype="DataFrame",
    ).save()
    # Case 3a: user passes no external features
    with pytest.raises(ln.errors.ValidationError) as error:
        artifact = ln.Artifact.from_dataframe(
            df,
            key="test_df_with_external_features.parquet",
            schema=schema_correct_external,
        ).save()
    assert (
        "External features slot is defined in schema but no external features were provided."
        in error.exconly()
    )
    # alternative via DataFrameCurator directly
    with pytest.raises(ln.errors.ValidationError) as error:
        curator = ln.curators.DataFrameCurator(
            df,
            schema=schema_correct_external,
        )
        artifact = curator.save_artifact(
            key="test_df_with_external_features.parquet",
        ).save()
    assert (
        "External features slot is defined in schema but no external features were provided."
        in error.exconly()
    )
    # Case 3b: user provides external features
    artifact = ln.Artifact.from_dataframe(
        df,
        key="test_df_with_external_features.parquet",
        features={"feature_a": "x", "feature_b": "y"},
        schema=schema_correct_external,
    ).save()
    assert artifact.features.get_values() == {"feature_a": "x", "feature_b": "y"}
    assert (
        artifact.features.describe(return_str=True)
        == """\
Artifact: test_df_with_external_features.parquet (0000)
├── Dataset features
│   └── columns (2)
│           feat1    int
│           feat2    int
└── External features
    └── feature_a    str    x
        feature_b    str    y"""
    )
    with pytest.raises(ValueError) as error:
        artifact.features.remove_values("feature_a", value="x")
    assert "Cannot remove values if artifact has external schema." in error.exconly()
    artifact.delete(permanent=True)
    # alternative via DataFrameCurator directly
    curator = ln.curators.DataFrameCurator(
        df,
        schema=schema_correct_external,
        features={"feature_a": "x", "feature_b": "y"},
    )
    artifact = curator.save_artifact(
        key="test_df_with_external_features.parquet",
    ).save()
    assert artifact.features.get_values() == {"feature_a": "x", "feature_b": "y"}
    # call this again to check calling with an existing artifact
    curator = ln.curators.DataFrameCurator(
        artifact,
        schema=schema_correct_external,
        features={"feature_a": "z", "feature_b": "y"},
    )
    artifact = curator.save_artifact(
        key="test_df_with_external_features.parquet",
    ).save()
    assert artifact.features.get_values() == {"feature_a": "z", "feature_b": "y"}
    # call this again without passing features explicitly (they're already part of the artifact)
    curator = ln.curators.DataFrameCurator(
        artifact,
        schema=schema_correct_external,
    )
    artifact = curator.save_artifact(
        key="test_df_with_external_features.parquet",
    ).save()
    assert artifact.features.get_values() == {"feature_a": "z", "feature_b": "y"}
    # clean up everything
    inferred_schema = artifact.schemas.all()[0]
    artifact.schemas.remove(inferred_schema.id)
    inferred_schema.delete(permanent=True)
    artifact.delete(permanent=True)
    schema_with_mistake.delete(permanent=True)
    schema_no_external.delete(permanent=True)
    schema_correct_external.delete(permanent=True)
    schema_external.delete(permanent=True)


================================================
FILE: tests/core/test_artifact_describe_to_dataframe.py
================================================
from datetime import date

import bionty as bt
import lamindb as ln
import numpy as np
import pandas as pd
import pytest
from lamindb.models._describe import describe_postgres, describe_sqlite


def _check_df_equality(actual_df: pd.DataFrame, expected_df: pd.DataFrame) -> bool:
    """Checks equality between two DataFrames.

    Special handling for columns containing sets and NaN values.
    """
    # do not test indices by default
    # pd.testing.assert_index_equal(actual_df.index, expected_df.index)
    expected_df.index = actual_df.index
    assert set(actual_df.columns) == set(expected_df.columns)
    for col in expected_df.columns:
        # Detect if column contains sets by checking first non-null value
        first_value = next((v for v in expected_df[col] if pd.notna(v)), None)
        is_set_column = isinstance(first_value, set)
        if is_set_column:
            # For set columns, compare sets with NaN handling
            for idx in expected_df.index:
                actual_val = actual_df.loc[idx, col]
                expected_val = expected_df.loc[idx, col]
                # If both are NaN, they're equal
                if pd.isna(actual_val) and pd.isna(expected_val):
                    continue
                # If one is NaN and the other isn't, they're not equal
                if pd.isna(actual_val) != pd.isna(expected_val):
                    raise AssertionError(f"NaN mismatch at index {idx} in column {col}")
                # If neither is NaN, compare the sets
                assert actual_val == expected_val, (
                    f"Set mismatch at index {idx} in column {col}"
                )
        else:
            pd.testing.assert_series_equal(
                actual_df[col],
                expected_df[col],
                check_names=False,  # ignore series names
            )
    return True


# parallels the `registries` guide
# please also see the test_querset.py tests
def test_describe_to_dataframe_example_dataset():
    ln.examples.datasets.mini_immuno.save_mini_immuno_datasets()
    artifact = ln.Artifact.get(key="examples/dataset1.h5ad")
    artifact2 = ln.Artifact.get(key="examples/dataset2.h5ad")
    with pytest.raises(ValueError) as error:
        artifact.features.remove_values("cell_type_by_expert")
    assert "Cannot remove values for dataset features."
in error.exconly() # Test df(include=[...]) df = ( ln.Artifact.filter(key__startswith="examples/dataset", suffix=".h5ad") .order_by("-key") .to_dataframe(include=["schemas__hash", "schemas__name"]) .drop(["uid"], axis=1) ) expected_data = { "key": ["examples/dataset2.h5ad", "examples/dataset1.h5ad"], "schemas__hash": [ set(artifact2.schemas.all().values_list("hash", flat=True)), set(artifact.schemas.all().values_list("hash", flat=True)), ], "schemas__name": [{None}, {None}], } expected_df = pd.DataFrame(expected_data) _check_df_equality(df, expected_df) # Test df with features # test that the records filter DOES NOT affect joining the annotations # we want it to only affect the artifact query (even though here, it won't change the result as both artifacts have the IFNG label) df = ( ln.Artifact.filter( key__startswith="examples/dataset", suffix=".h5ad", records__name="IFNG", ) .order_by("-key") .to_dataframe( features=[ "cell_type_by_expert", "cell_type_by_model", "experiment", "perturbation", "temperature", "study_note", "date_of_study", ] ) .drop(["uid"], axis=1) ) expected_data = { "key": ["examples/dataset2.h5ad", "examples/dataset1.h5ad"], "cell_type_by_expert": [np.nan, {"CD8-positive, alpha-beta T cell", "B cell"}], "cell_type_by_model": [{"T cell", "B cell"}, {"T cell", "B cell"}], "experiment": pd.Categorical(["Experiment 2", "Experiment 1"]), "perturbation": [{"IFNG", "DMSO"}, {"IFNG", "DMSO"}], "temperature": [22.6, 21.6], "study_note": [ np.nan, "We had a great time performing this study and the results look compelling.", ], "date_of_study": [date(2025, 2, 13), date(2024, 12, 1)], "study_metadata": [ {"detail1": "456", "detail2": 2}, {"detail1": "123", "detail2": 1}, ], } expected_df = pd.DataFrame(expected_data) _check_df_equality(df, expected_df) # Test filtering artifacts by schemas__in (alternative approach) # Query artifacts that measure CD8A gene by filtering schemas first cd8a = bt.Gene.get(symbol="CD8A") schemas_with_cd8a = ln.Schema.filter(genes=cd8a) df = ln.Artifact.filter(schemas__in=schemas_with_cd8a).to_dataframe() assert set(df["key"]) == {"examples/dataset2.h5ad", "examples/dataset1.h5ad"} # check backward compat query with deprecation warning with pytest.warns( DeprecationWarning, match="Querying Artifact by `feature_sets` is deprecated" ): df = ln.Artifact.filter(feature_sets__in=schemas_with_cd8a).to_dataframe() assert set(df["key"]) == {"examples/dataset2.h5ad", "examples/dataset1.h5ad"} # expected output has italicized elements that can't be tested # hence testing is restricted to section content, not headings output = artifact.describe(return_str=True) assert "hash:" in output assert "size:" in output assert "schema:" in output assert "n_observations: 3" in output assert "storage/path:" in output assert "created_by:" in output assert "created_at:" in output # dataset section assert ( artifact.features.describe(return_str=True) == """Artifact: examples/dataset1.h5ad (0000) ├── Dataset features │ ├── obs (4) │ │ cell_type_by_expe… bionty.CellType B cell, CD8-positive, alph… │ │ cell_type_by_model bionty.CellType B cell, T cell │ │ perturbation Record DMSO, IFNG │ │ sample_note str │ └── var.T (3 bionty.G… │ CD14 num │ CD4 num │ CD8A num └── External features └── experiment Record Experiment 1 date_of_study date 2024-12-01 study_metadata dict {'detail1': '123', 'detail… study_note str We had a great time perfor… temperature float 21.6""" ) # labels section if ln.setup.settings.instance.dialect == "postgresql": description_tree = describe_postgres(artifact) 
else: description_tree = describe_sqlite(artifact) labels_node = description_tree.children[-1].label assert labels_node.label.plain == "Labels" assert len(labels_node.children[0].label.columns) == 3 assert len(labels_node.children[0].label.rows) == 2 assert labels_node.children[0].label.columns[0]._cells == [ ".records", ".cell_types", ] assert labels_node.children[0].label.columns[1]._cells[0].plain == "Record" assert labels_node.children[0].label.columns[1]._cells[1].plain == "bionty.CellType" assert { c.strip() for c in ",".join(labels_node.children[0].label.columns[2]._cells).split(",") } == { "DMSO", "IFNG", "Experiment 1", "B cell", "T cell", "CD8-positive", "alpha-beta T cell", } # set_values should only replace external features, not dataset-derived features values_before = artifact.features.get_values() adata = artifact.load() just_internal = { col: values_before[col] for col in adata.obs.columns if col in values_before } artifact.features.set_values({"temperature": 99.0}) values_after_set = artifact.features.get_values() assert {col: values_after_set[col] for col in just_internal} == just_internal assert values_after_set["temperature"] == 99.0 assert set(values_after_set.keys()) == set(just_internal) | {"temperature"} # test that only external feature are removed upon artifact.features.remove_values() alljson_values = artifact.features.get_values() artifact.features.remove_values() assert just_internal != alljson_values assert just_internal == artifact.features.get_values() artifact.delete(permanent=True) artifact2.delete(permanent=True) ln.Schema.get(name="anndata_ensembl_gene_ids_and_valid_features_in_obs").delete( permanent=True ) ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) bt.Gene.filter().delete(permanent=True) ln.Record.filter().delete(permanent=True) bt.CellType.filter().delete(permanent=True) ================================================ FILE: tests/core/test_artifact_features_annotations.py ================================================ # ruff: noqa: F811 from datetime import date, datetime import bionty as bt import lamindb as ln import pytest from lamindb.examples.datasets import mini_immuno from lamindb.models.query_set import BasicQuerySet, SQLRecordList # see test_record_basics.py for similar test for records (populate and query by features) def test_artifact_features_add_remove_query(): record_type1 = ln.Record(name="RecordType1", is_type=True).save() record_entity1 = ln.Record(name="entity1", type=record_type1).save() record_entity2 = ln.Record(name="entity2", type=record_type1).save() ulabel = ln.ULabel(name="test-ulabel").save() artifact = ln.Artifact(".gitignore", key="test-artifact").save() transform = ln.Transform(key="test-transform").save() run = ln.Run(transform, name="test-run").save() feature_str = ln.Feature(name="feature_str", dtype=str).save() feature_list_str = ln.Feature(name="feature_list_str", dtype=list[str]).save() feature_int = ln.Feature(name="feature_int", dtype=int).save() feature_float = ln.Feature(name="feature_float", dtype=float).save() feature_num = ln.Feature(name="feature_num", dtype="num").save() feature_datetime = ln.Feature(name="feature_datetime", dtype=datetime).save() feature_date = ln.Feature( name="feature_date", dtype=datetime.date, coerce=True ).save() feature_dict = ln.Feature(name="feature_dict", dtype=dict).save() feature_type1 = ln.Feature(name="feature_type1", dtype=record_type1).save() feature_type1s = ln.Feature(name="feature_type1s", dtype=list[record_type1]).save() 
feature_ulabel = ln.Feature(name="feature_ulabel", dtype=ln.ULabel).save() feature_user = ln.Feature(name="feature_user", dtype=ln.User).save() feature_project = ln.Feature(name="feature_project", dtype=ln.Project).save() feature_artifact = ln.Feature(name="feature_artifact", dtype=ln.Artifact).save() feature_artifact_2 = ln.Feature(name="feature_artifact_2", dtype=ln.Artifact).save() feature_run = ln.Feature(name="feature_run", dtype=ln.Run.uid).save() feature_cell_line = ln.Feature(name="feature_cell_line", dtype=bt.CellLine).save() ln.Feature(name="feature_cell_line_pass_list", dtype=bt.CellLine).save() feature_cell_lines = ln.Feature( name="feature_cell_lines", dtype=list[bt.CellLine] ).save() feature_cl_ontology_id = ln.Feature( name="feature_cl_ontology_id", dtype=bt.CellLine.ontology_id ).save() feature_gene_ontology_id = ln.Feature( name="feature_gene_ontology_id", dtype=bt.Gene.ensembl_gene_id ).save() test_artifact = ln.Artifact(".gitignore", key="test_artifact").save() value_artifact = ln.Artifact("pyproject.toml", key="value_artifact.toml").save() test_project = ln.Project(name="test_project").save() hek293 = bt.CellLine.from_source(name="HEK293").save() a549 = bt.CellLine.from_source(name="A-549").save() gene1 = bt.Gene.from_source(ensembl_gene_id="ENSG00000139618").save() gene2 = bt.Gene.from_source(ensembl_gene_id="ENSG00000141510").save() # no schema validation test_values = { "feature_str": "a string value", "feature_list_str": ["value1", "value2", "value3"], "feature_int": 42, "feature_float": 3.14, "feature_num": 2.71, "feature_datetime": datetime(2024, 1, 1, 12, 0, 0), "feature_date": date(2024, 1, 1), "feature_dict": {"key": "value", "number": 123, "list": [1, 2, 3]}, "feature_type1": "entity1", "feature_type1s": ["entity1", "entity2"], "feature_ulabel": "test-ulabel", "feature_user": ln.setup.settings.user.handle, "feature_project": "test_project", "feature_cell_line": "HEK293", # allowed if observational unit not specified, comes from aggregation "feature_cell_line_pass_list": ["HEK293", "A-549"], "feature_cell_lines": ["HEK293", "A-549"], "feature_cl_ontology_id": "CVCL_0045", "feature_artifact": "test-artifact", "feature_artifact_2": "value_artifact.toml", "feature_run": run.uid, } test_artifact.features.add_values(test_values) # ManyToMany accessors assert set(test_artifact.artifacts.to_list()) == {test_artifact, value_artifact} assert set(value_artifact.linked_by_artifacts.to_list()) == {test_artifact} assert set(test_artifact.linked_by_artifacts.to_list()) == {test_artifact} assert value_artifact.artifacts.to_list() == [] # get_values accessor return_values = test_artifact.features.get_values() # special handling if passing a list of categories to a cat feature: it's interpreted as the result of an aggregation # hence upon retrieval it's a set of categories, not a list of categories values_pass_list = return_values.pop("feature_cell_line_pass_list") assert values_pass_list == set(test_values.pop("feature_cell_line_pass_list")) assert return_values == test_values # __get_item__ accessor assert test_artifact.features["feature_str"] == test_values["feature_str"] assert test_artifact.features["feature_list_str"] == test_values["feature_list_str"] assert test_artifact.features["feature_int"] == test_values["feature_int"] assert test_artifact.features["feature_float"] == test_values["feature_float"] assert test_artifact.features["feature_num"] == test_values["feature_num"] assert test_artifact.features["feature_datetime"] == test_values["feature_datetime"] assert 
test_artifact.features["feature_date"] == test_values["feature_date"] assert test_artifact.features["feature_dict"] == test_values["feature_dict"] assert test_artifact.features["feature_type1"] == record_entity1 assert set(test_artifact.features["feature_type1s"]) == { record_entity1, record_entity2, } assert test_artifact.features["feature_ulabel"] == ulabel assert ( test_artifact.features["feature_user"].handle == ln.setup.settings.user.handle ) assert test_artifact.features["feature_project"] == test_project assert test_artifact.features["feature_cell_line"] == hek293 assert test_artifact.features["feature_cl_ontology_id"] == hek293 value = test_artifact.features["feature_cell_line_pass_list"] assert set(value) == {hek293, a549} assert isinstance(value, BasicQuerySet) value = test_artifact.features["feature_cell_lines"] assert set(value) == {hek293, a549} assert isinstance(value, SQLRecordList) assert test_artifact.features["feature_artifact"] == test_artifact assert test_artifact.features["feature_artifact_2"] == value_artifact assert test_artifact.features["feature_run"] == run # --- Query by features (same data as above) --- # Equality assert ln.Artifact.filter(feature_str="a string value").one() == test_artifact assert ln.Artifact.filter(feature_int=42).one() == test_artifact assert ln.Artifact.filter(feature_type1="entity1").one() == test_artifact assert ln.Artifact.filter(feature_cell_line="HEK293").one() == test_artifact assert ( ln.Artifact.filter(feature_str="a string value", feature_int=42).one() == test_artifact ) # Datetime and date (filter uses ISO strings as stored in JSON) assert ( ln.Artifact.filter(feature_datetime="2024-01-01T12:00:00").one() == test_artifact ) assert ln.Artifact.filter(feature_date="2024-01-01").one() == test_artifact # __contains (categorical) assert ln.Artifact.filter(feature_cell_line__contains="HEK").one() == test_artifact assert ln.Artifact.filter(feature_type1__contains="entity").one() == test_artifact # Invalid field with pytest.raises(ln.errors.InvalidArgument) as error: ln.Artifact.filter(feature_str_typo="x", feature_int=42).one() assert error.exconly().startswith( "lamindb.errors.InvalidArgument: You can query either by available fields:" ) # ln.errors.ObjectDoesNotExist (no object named "nonexistent_entity" exists) with pytest.raises(ln.errors.ObjectDoesNotExist) as error: ln.Artifact.filter(feature_type1="nonexistent_entity").one() assert "Did not find" in error.exconly() # Combined filter (3 keys) assert ( ln.Artifact.filter( feature_str="a string value", feature_int=42, feature_type1="entity1", ).one() == test_artifact ) # Bionty: filter by record assert ln.Artifact.filter(feature_cell_line=hek293).one() == test_artifact # Bionty: filter by ontology_id string assert ln.Artifact.filter(feature_cl_ontology_id="CVCL_0045").one() == test_artifact # Bionty __contains (ontology_id) assert ( ln.Artifact.filter(feature_cl_ontology_id__contains="0045").one() == test_artifact ) # ln.errors.ObjectDoesNotExist (object not found: feature_project) with pytest.raises(ln.errors.ObjectDoesNotExist) as error: ln.Artifact.filter(feature_project="nonexistent_project").one() assert "Did not find" in error.exconly() # __contains returns multiple (add second artifact, assert, then remove) value_artifact.features.add_values({"feature_type1": "entity2"}) assert len(ln.Artifact.filter(feature_type1__contains="entity")) == 2 value_artifact.features.remove_values("feature_type1") # Numeric comparators __lt, __gt (int, float, num) assert 
ln.Artifact.filter(feature_int__lt=21).one_or_none() is None assert len(ln.Artifact.filter(feature_int__gt=21)) >= 1 # int __lt/__gt that would fail with string comparison (42 vs 5, 42 vs 100) assert ln.Artifact.filter(feature_int__lt=5).one_or_none() is None assert ln.Artifact.filter(feature_int__gt=100).one_or_none() is None # float/num __lt/__gt (numeric comparison on SQLite via json_extract + CAST) assert ln.Artifact.filter(feature_float__lt=5.0).one() == test_artifact assert ln.Artifact.filter(feature_float__gt=1.0).one() == test_artifact assert ln.Artifact.filter(feature_float__gt=10.0).one_or_none() is None assert ln.Artifact.filter(feature_num__lt=5.0).one() == test_artifact assert ln.Artifact.filter(feature_num__gt=1.0).one() == test_artifact assert ln.Artifact.filter(feature_num__gt=10.0).one_or_none() is None # Date and datetime comparators (ISO strings) assert ln.Artifact.filter(feature_date__lt="2024-01-02").one() == test_artifact assert ln.Artifact.filter(feature_date__gt="2023-12-31").one() == test_artifact assert ln.Artifact.filter(feature_date__gt="2024-01-02").one_or_none() is None assert ( ln.Artifact.filter(feature_datetime__lt="2024-01-01T13:00:00").one() == test_artifact ) assert ( ln.Artifact.filter(feature_datetime__gt="2024-01-01T11:00:00").one() == test_artifact ) assert ( ln.Artifact.filter(feature_datetime__lt="2024-01-01T11:00:00").one_or_none() is None ) # remove values # this was already popped from test_values above test_artifact.features.remove_values("feature_cell_line_pass_list") test_artifact.features.remove_values("feature_int") test_values.pop("feature_int") test_artifact.features.remove_values("feature_float") test_values.pop("feature_float") test_artifact.features.remove_values("feature_num") test_values.pop("feature_num") assert test_artifact.features.get_values() == test_values test_artifact.features.remove_values("feature_date") test_values.pop("feature_date") assert test_artifact.features.get_values() == test_values test_artifact.features.remove_values("feature_type1") test_values.pop("feature_type1") assert test_artifact.features.get_values() == test_values test_artifact.features.remove_values("feature_type1s") test_values.pop("feature_type1s") assert test_artifact.features.get_values() == test_values test_artifact.features.remove_values("feature_ulabel") test_values.pop("feature_ulabel") assert test_artifact.features.get_values() == test_values # test passing a list to remove_values test_artifact.features.remove_values(["feature_cell_line", "feature_user"]) test_values.pop("feature_cell_line") test_values.pop("feature_user") assert test_artifact.features.get_values() == test_values test_artifact.features.remove_values("feature_artifact") test_values.pop("feature_artifact") assert test_artifact.features.get_values() == test_values test_artifact.features.remove_values("feature_run") test_values.pop("feature_run") assert test_artifact.features.get_values() == test_values # test passing None has no effect, does not lead to annotation test_artifact.features.add_values( { "feature_int": None, "feature_float": None, "feature_num": None, "feature_type1": None, } ) assert test_artifact.features.get_values() == test_values # test bulk removal assert list(test_values.keys()) == [ "feature_str", "feature_list_str", "feature_datetime", "feature_dict", "feature_project", "feature_cell_lines", "feature_cl_ontology_id", "feature_artifact_2", ] test_artifact.features.remove_values() test_values = {} assert test_artifact.features.get_values() == test_values # 
test passing ISO-format date string for date test_artifact.features.add_values({"feature_date": "2024-01-01"}) test_values["feature_date"] = date(2024, 1, 1) assert test_artifact.features.get_values() == test_values # test passing bionty objects instead of strings (using gene1 and gene2 because organism-dependent ontologies) test_artifact.features.add_values({"feature_gene_ontology_id": [gene1, gene2]}) test_values["feature_gene_ontology_id"] = {"ENSG00000139618", "ENSG00000141510"} assert test_artifact.features.get_values() == test_values test_values.pop("feature_gene_ontology_id") test_artifact.features.remove_values("feature_gene_ontology_id") # test add_values() when there is already something there test_artifact.features.add_values({"feature_date": "2024-02-01"}) test_values["feature_date"] = {date(2024, 1, 1), date(2024, 2, 1)} test_artifact.features.add_values({"feature_str": "a string value"}) test_values["feature_str"] = "a string value" assert test_artifact.features.get_values() == test_values # test set_values() test_values = {} test_values["feature_date"] = date(2024, 3, 1) test_artifact.features.set_values({"feature_date": "2024-03-01"}) assert test_artifact.features.get_values() == test_values # schema validation feature_str = ln.Feature.get(name="feature_str") feature_int = ln.Feature.get(name="feature_int") schema = ln.Schema([feature_str, feature_int], name="test_schema").save() with pytest.raises(ln.errors.ValidationError) as error: test_artifact.features.add_values({"feature_type1": "entity1"}, schema=schema) assert "COLUMN_NOT_IN_DATAFRAME" in error.exconly() schema.delete(permanent=True) # test with list of strings schema = ln.Schema([feature_cell_lines], name="test_schema2").save() test_artifact.features.add_values( {"feature_cell_lines": ["HEK293", "A-549"]}, schema=schema ) schema.delete(permanent=True) # test with list of records (rather than passing strings) schema = ln.Schema([feature_cell_lines], name="test_schema2").save() test_artifact.features.add_values( {"feature_cell_lines": [a549, hek293]}, schema=schema ) schema.delete(permanent=True) # clean up rest test_artifact.delete(permanent=True) feature_str.delete(permanent=True) feature_list_str.delete(permanent=True) feature_int.delete(permanent=True) feature_float.delete(permanent=True) feature_num.delete(permanent=True) feature_datetime.delete(permanent=True) feature_date.delete(permanent=True) feature_type1.delete(permanent=True) feature_type1s.delete(permanent=True) feature_user.delete(permanent=True) feature_project.delete(permanent=True) feature_dict.delete(permanent=True) feature_artifact.delete(permanent=True) feature_artifact_2.delete(permanent=True) feature_run.delete(permanent=True) feature_ulabel.delete(permanent=True) feature_cell_lines.delete(permanent=True) record_entity1.delete(permanent=True) record_entity2.delete(permanent=True) record_type1.delete(permanent=True) test_project.delete(permanent=True) feature_cell_line.delete(permanent=True) feature_cl_ontology_id.delete(permanent=True) feature_gene_ontology_id.delete(permanent=True) hek293.delete(permanent=True) a549.delete(permanent=True) gene1.delete(permanent=True) gene2.delete(permanent=True) ulabel.delete(permanent=True) artifact.delete(permanent=True) run.delete(permanent=True) transform.delete(permanent=True) def test_features_name_duplicates_across_root_and_nested(): feature1 = ln.Feature(name="sample_name", dtype=ln.Record).save() lab_a_type = ln.Feature(name="LabA", is_type=True).save() feature2 = ln.Feature(name="sample_name", 
dtype=ln.Record, type=lab_a_type).save() record_sample = ln.Record(name="sample").save() test_artifact = ln.Artifact(".gitignore", key="test_artifact").save() test_artifact.features.add_values({"sample_name": "sample"}) assert test_artifact.features.get_values() == {"sample_name": "sample"} test_artifact.delete(permanent=True) record_sample.delete(permanent=True) feature1.delete(permanent=True) feature2.delete(permanent=True) lab_a_type.delete(permanent=True) # also see test_curator_schema_feature_mapping def test_features_name_duplicates_across_equal_levels(): lab_a_type = ln.Feature(name="LabA", is_type=True).save() feature1 = ln.Feature(name="sample_name", dtype=ln.Record, type=lab_a_type).save() lab_b_type = ln.Feature(name="LabB", is_type=True).save() feature2 = ln.Feature(name="sample_name", dtype=ln.Record, type=lab_b_type).save() schema1 = ln.Schema([feature1], name="Lab A schema").save() record_sample = ln.Record(name="sample").save() test_artifact = ln.Artifact(".gitignore", key="test_artifact").save() # cannot disambiguate without schema with pytest.raises(ln.errors.ValidationError) as error: test_artifact.features.add_values({"sample_name": "sample"}) assert ( "Ambiguous match for Feature 'sample_name': found 2 features at depth 1 (under types: ['LabA', 'LabB'])" in error.exconly() ) # with schema, first one test_artifact.features.add_values({"sample_name": "sample"}, schema=schema1) assert test_artifact.features.get_values() == {"sample_name": "sample"} assert test_artifact.links_record.get().feature.type == lab_a_type test_artifact.delete(permanent=True) test_artifact = ln.Artifact(".gitignore", key="test_artifact").save() # now the other schema schema2 = ln.Schema([feature2], name="Lab B schema").save() test_artifact.features.add_values({"sample_name": "sample"}, schema=schema2) assert test_artifact.features.get_values() == {"sample_name": "sample"} assert test_artifact.links_record.get().feature.type == lab_b_type test_artifact.delete(permanent=True) record_sample.delete(permanent=True) schema2.delete(permanent=True) schema1.delete(permanent=True) feature1.delete(permanent=True) feature2.delete(permanent=True) lab_a_type.delete(permanent=True) lab_b_type.delete(permanent=True) def test_feature_predicate_queries_safe_hybrid(): lab_a_type = ln.Feature(name="PredLabA", is_type=True).save() feature_a = ln.Feature(name="pred_name", dtype=str, type=lab_a_type).save() lab_b_type = ln.Feature(name="PredLabB", is_type=True).save() feature_b = ln.Feature(name="pred_name", dtype=str, type=lab_b_type).save() score_feature = ln.Feature(name="pred_score", dtype=int).save() cell_type_feature = ln.Feature(name="pred_cell_type", dtype=bt.CellLine).save() # safe hybrid behavior for model identity + hashability assert feature_a == feature_a assert feature_a != feature_b assert len({feature_a, feature_b}) == 2 schema_a = ln.Schema([feature_a], name="pred schema a").save() schema_b = ln.Schema([feature_b], name="pred schema b").save() artifact_a = ln.Artifact( ".gitignore", key="pred-artifact-a", skip_hash_lookup=True, ).save() artifact_b = ln.Artifact( ".gitignore", key="pred-artifact-b", skip_hash_lookup=True, ).save() artifact_a.features.add_values({"pred_name": "hello"}, schema=schema_a) artifact_b.features.add_values({"pred_name": "hello"}, schema=schema_b) artifact_a.features.add_values({"pred_score": 5}) artifact_b.features.add_values({"pred_score": 1}) hek293 = bt.CellLine.from_source(name="HEK293").save() artifact_a.features.add_values({"pred_cell_type": hek293}) # same feature name 
can be disambiguated by passing the Feature object assert ln.Artifact.filter(feature_a == "hello").one() == artifact_a assert ln.Artifact.filter(feature_b == "hello").one() == artifact_b # Feature compared to another model should still generate a predicate assert ln.Artifact.filter(cell_type_feature == hek293).one() == artifact_a # comparator operators on non-categorical feature values assert ln.Artifact.filter(score_feature > 2).one() == artifact_a assert ln.Artifact.filter(score_feature <= 1).one() == artifact_b neq_results = ln.Artifact.filter(score_feature != 5) assert artifact_b in neq_results assert artifact_a not in neq_results # mixed predicate and regular kwargs filters assert ( ln.Artifact.filter(feature_a == "hello", key="pred-artifact-a").one() == artifact_a ) artifact_a.delete(permanent=True) artifact_b.delete(permanent=True) schema_a.delete(permanent=True) schema_b.delete(permanent=True) feature_a.delete(permanent=True) feature_b.delete(permanent=True) score_feature.delete(permanent=True) cell_type_feature.delete(permanent=True) lab_a_type.delete(permanent=True) lab_b_type.delete(permanent=True) hek293.delete(permanent=True) def test_features_add_with_schema(): df = mini_immuno.get_dataset1(otype="DataFrame") artifact = ln.Artifact.from_dataframe(df, description="test dataset").save() species = ln.Feature(name="species", dtype="str").save() split = ln.Feature(name="split", dtype="str").save() schema = ln.Schema([species, split]).save() with pytest.raises(ln.errors.ValidationError) as e: artifact.features.add_values({"doesnot": "exist"}, schema=schema) assert "column 'split' not in dataframe" in str(e.value) artifact.features.add_values({"species": "bird", "split": "train"}, schema=schema) artifact.save() assert artifact.features.get_values() == {"species": "bird", "split": "train"} artifact.delete(permanent=True) schema.delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_artifact_feature_cat_filters_schema_end_to_end(): schema_feature = ln.Feature(name="schema_filter_column_e2e", dtype=str).save() required_schema = ln.Schema( name="required_schema_for_artifact_filter", features=[schema_feature], ).save() artifact_feature = ln.Feature( name="input_artifact", dtype=ln.Artifact, cat_filters={"schema": required_schema}, ).save() container_artifact = ln.Artifact( ".gitignore", key="container_for_artifact_schema_filter", skip_hash_lookup=True, ).save() artifact_without_schema = ln.Artifact( ".gitignore", key="artifact_without_required_schema", skip_hash_lookup=True, ).save() artifact_with_schema = ln.Artifact( ".gitignore", key="artifact_with_required_schema", schema=required_schema, skip_hash_lookup=True, ).save() try: with pytest.raises(ln.errors.ValidationError) as error: container_artifact.features.add_values( {"input_artifact": artifact_without_schema.key} ) assert "1 term not validated in feature 'input_artifact'" in error.exconly() container_artifact.features.add_values( {"input_artifact": artifact_with_schema.key} ) assert container_artifact.features["input_artifact"] == artifact_with_schema finally: container_artifact.delete(permanent=True) artifact_without_schema.delete(permanent=True) artifact_with_schema.delete(permanent=True) artifact_feature.delete(permanent=True) required_schema.delete(permanent=True) schema_feature.delete(permanent=True) def test_features_add_remove_error_behavior(): """Add/remove/validation behavior.""" adata = ln.examples.datasets.anndata_with_obs() artifact = ln.Artifact.from_anndata(adata, description="test").save() with 
pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values({"experiment": "Experiment 1"}) assert ( error.exconly() == """lamindb.errors.ValidationError: These keys could not be validated: ['experiment'] Here is how to create a feature: ln.Feature(name='experiment', dtype='cat ? str').save()""" ) ln.Feature(name="experiment", dtype=ln.Record).save() with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values({"experiment": "Experiment 1"}) assert error.exconly().startswith( "lamindb.errors.ValidationError: 1 term not validated in feature 'experiment'" ) ln.Record(name="Experiment 1").save() # now add the label with the feature and make sure that it has the feature annotation artifact.features.add_values({"experiment": "Experiment 1"}) assert artifact.links_record.get().record.name == "Experiment 1" assert artifact.links_record.get().feature.name == "experiment" # repeat artifact.features.add_values({"experiment": "Experiment 1"}) assert artifact.links_record.get().record.name == "Experiment 1" # numerical feature temperature = ln.Feature(name="temperature", dtype=ln.Record).save() with pytest.raises(TypeError) as error: artifact.features.add_values({"temperature": 27.2}) assert error.exconly().startswith( "TypeError: Type mismatch: identifiers are 'numeric' but field_values are 'str/categorical'." ) temperature.delete(permanent=True) temperature = ln.Feature(name="temperature", dtype="num").save() artifact.features.add_values({"temperature": 27.2}) assert artifact.json_values.first().value == 27.2 # datetime feature with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values({"date_of_experiment": "2024-12-01"}) assert ( error.exconly() == """lamindb.errors.ValidationError: These keys could not be validated: ['date_of_experiment'] Here is how to create a feature: ln.Feature(name='date_of_experiment', dtype='date').save()""" ) ln.Feature(name="date_of_experiment", dtype=datetime.date, coerce=True).save() with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values({"date_of_experiment": "Typo2024-12-01"}) assert "WRONG_DATATYPE" in error.exconly() artifact.features.add_values({"date_of_experiment": "2024-12-01"}) ln.Feature(name="datetime_of_experiment", dtype=datetime, coerce=True).save() artifact.features.add_values({"datetime_of_experiment": "2024-12-01 00:00:00"}) # bionty feature mouse = bt.Organism.from_source(name="mouse") with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values({"organism": mouse}) assert ( error.exconly() == """lamindb.errors.ValidationError: These keys could not be validated: ['organism'] Here is how to create a feature: ln.Feature(name='organism', dtype='cat[bionty.Organism]').save()""" ) ln.Feature(name="organism", dtype=bt.Organism).save() with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values({"organism": mouse}) assert ( # ensure the label is saved error.exconly() == "lamindb.errors.ValidationError: Organism mouse is not saved." 
) mouse.save() artifact.features.add_values({"organism": mouse}) assert artifact.organisms.get().name == "mouse" # lists of records diseases = bt.Disease.from_values( ["MONDO:0004975", "MONDO:0004980"], field=bt.Disease.ontology_id ).save() ln.Feature(name="disease", dtype=bt.Disease.ontology_id).save() artifact.features.add_values({"disease": diseases}) assert len(artifact.diseases.filter()) == 2 # check get_values returns ontology_ids as specified in the feature dtype assert artifact.features.get_values()["disease"] == { "MONDO:0004975", "MONDO:0004980", } # big dictionary of everything features = { "experiment": [ # we're testing iterable annotation here "Experiment 2", "Experiment 1", ], "project": "project_1", "is_validated": True, "cell_type_by_expert": "T cell", "temperature": 100.0, "donor": "U0123", } with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values(features) assert ( error.exconly() == """\ lamindb.errors.ValidationError: These keys could not be validated: ['project', 'is_validated', 'cell_type_by_expert', 'donor'] Here is how to create a feature: ln.Feature(name='project', dtype='cat ? str').save() ln.Feature(name='is_validated', dtype='bool').save() ln.Feature(name='cell_type_by_expert', dtype='cat ? str').save() ln.Feature(name='donor', dtype='cat ? str').save()""" ) ln.Feature(name="project", dtype=ln.Record).save() ln.Feature(name="is_validated", dtype=bool).save() ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save() ln.Feature(name="donor", dtype=ln.Record).save() with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values(features) error_msg = error.exconly() assert ( "lamindb.errors.ValidationError: These values could not be validated:" in error_msg ) assert "Here is how to create records for them:" in error_msg expected_values = { "Record": ["project_1", "U0123", "Experiment 2"], "bionty.CellType": ["T cell"], } for key, values in expected_values.items(): assert f"'{key}':" in error_msg for value in values: assert value in error_msg assert f"{key.split('.')[-1]}.from_values(" in error_msg assert "create=True).save()" in error_msg ln.Record.from_values(["Experiment 2", "project_1", "U0123"], create=True).save() bt.CellType.from_source(name="T cell").save() artifact.features.add_values(features) assert set(artifact.json_values.all().values_list("value", flat=True)) == { 27.2, True, 100.0, "2024-12-01", "2024-12-01T00:00:00", } assert ln.Artifact.get(json_values__value=27.2) assert artifact.features.get_values() == { "disease": {"MONDO:0004975", "MONDO:0004980"}, "experiment": {"Experiment 1", "Experiment 2"}, "project": "project_1", "cell_type_by_expert": "T cell", "donor": "U0123", "organism": "mouse", "is_validated": True, "temperature": {27.2, 100.0}, "date_of_experiment": date(2024, 12, 1), "datetime_of_experiment": datetime(2024, 12, 1, 0, 0, 0), } # hard to test because of italic formatting assert ( artifact.features.describe(return_str=True) == """Artifact: (0000) | description: test └── Features └── cell_type_by_expe… bionty.CellType T cell disease bionty.Disease.ontolog… MONDO:0004975, MONDO:00049… donor Record U0123 experiment Record Experiment 1, Experiment 2 organism bionty.Organism mouse project Record project_1 date_of_experiment date 2024-12-01 datetime_of_exper… datetime 2024-12-01 00:00:00 is_validated bool True temperature num 27.2, 100.0""" ) # repeat artifact.features.add_values(features) assert set(artifact.json_values.all().values_list("value", flat=True)) == { 27.2, True, 100.0, 
"2024-12-01", "2024-12-01T00:00:00", } # test remove_values artifact.features.remove_values("date_of_experiment") alzheimer = bt.Disease.get(name="Alzheimer disease") artifact.features.remove_values("disease", value=alzheimer) values = artifact.features.get_values() assert "date_of_experiment" not in values assert "MONDO:0004975" not in values["disease"] # test annotate with dictionaries multiple times ln.Feature(name="study_metadata", dtype=dict).save() artifact.features.add_values({"study_metadata": {"detail1": "123", "detail2": 1}}) # delete everything we created artifact.delete(permanent=True) ln.Record.filter().delete(permanent=True) ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) bt.Gene.filter().delete(permanent=True) bt.Organism.filter().delete(permanent=True) bt.Disease.filter().delete(permanent=True) def test_add_remove_list_features(ccaplog): feature = ln.Feature(name="list_of_str", dtype=list[str]).save() artifact = ln.Artifact(".gitignore", key=".gitignore").save() artifact.features.add_values({"list_of_str": ["1", "2", "3"]}) assert artifact.features.get_values() == {"list_of_str": ["1", "2", "3"]} # remove a non-linked value, this should do nothing but print a warning artifact.features.remove_values("list_of_str", value="4") assert "no feature 'list_of_str' with value '4' found" in ccaplog.text # list of categories feature cell_types_feature = ln.Feature( name="cell_types", dtype="list[cat[bionty.CellType]]" ).save() bt.CellType.from_values(["T cell", "B cell"]).save() artifact.features.add_values({"cell_types": ["T cell", "B cell"]}) assert set(artifact.features.get_values()["cell_types"]) == {"B cell", "T cell"} # passing value works here because we are linking each of the cell types in the list individually # in comparison to passing a list of numbers above t_cell = bt.CellType.get(name="T cell") artifact.features.remove_values("cell_types", value=t_cell) assert artifact.features.get_values()["cell_types"] == ["B cell"] # remove a non-linked value, this should print a warning but do nothing artifact.features.remove_values("cell_types", value=t_cell.parents.first()) assert "no feature 'cell_types' with value CellType(" in ccaplog.text # remove the entire linked feature artifact.features.remove_values("cell_types") assert "cell_types" not in artifact.features.get_values() # clean up artifact.delete(permanent=True) assert ln.models.JsonValue.filter(feature__name="list_of_str").count() == 1 feature.delete(permanent=True) assert ln.models.JsonValue.filter(feature__name="list_of_str").count() == 0 cell_types_feature.delete(permanent=True) bt.CellType.filter().delete(permanent=True) def test_add_list_of_cat_features(): type_1 = ln.Record(name="Type 1", is_type=True).save() for label in ["label 1", "label 2", "label 3"]: ln.Record(name=label, type=type_1).save() feat1 = ln.Feature( name="single_label_of_type1", dtype=type_1, nullable=False ).save() feat2 = ln.Feature( name="list_of_labels_of_type1", dtype=list[type_1], nullable=False ).save() schema = ln.Schema(name="Test schema", features=[feat1, feat2]).save() artifact = ln.Artifact( ".gitignore", key=".gitignore", ).save() # now just use add_values() with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values( { "single_label_of_type1": "invalid", } ) assert error.exconly().startswith( "lamindb.errors.ValidationError: 1 term not validated in feature 'single_label_of_type1': 'invalid'" ) # now for list of labels with pytest.raises(ln.errors.ValidationError) as error: 
        artifact.features.add_values(
            {
                "list_of_labels_of_type1": ["invalid", "invalid2"],
            }
        )
    assert error.exconly().startswith(
        "lamindb.errors.ValidationError: 2 terms not validated in feature 'list_of_labels_of_type1':"
    )
    artifact.delete(permanent=True)

    # now with schema
    artifact = ln.Artifact(
        ".gitignore",
        key=".gitignore",
        schema=schema,
        features={
            "single_label_of_type1": "label 1",
            "list_of_labels_of_type1": ["label 1", "label 2"],
        },
    ).save()
    with pytest.raises(ValueError) as error:
        artifact.features.add_values(
            {
                "single_label_of_type1": "invalid",
            }
        )
    assert "Cannot add values if artifact has external schema." in error.exconly()
    artifact.delete(permanent=True)
    schema.delete(permanent=True)
    feat1.delete(permanent=True)
    feat2.delete(permanent=True)
    type_1.records.all().delete(permanent=True)
    type_1.delete(permanent=True)


def test_artifact_features_accept_feature_object_keys():
    feature_score = ln.Feature(name="artifact_feature_object_score", dtype=int).save()
    feature_tag = ln.Feature(name="artifact_feature_object_tag", dtype=str).save()
    artifact = ln.Artifact(".gitignore", key="artifact_feature_object_test").save()
    artifact.features.add_values({feature_score: 7, "artifact_feature_object_tag": "a"})
    assert artifact.features.get_values() == {
        "artifact_feature_object_score": 7,
        "artifact_feature_object_tag": "a",
    }
    # set_values should also accept Feature objects as dictionary keys.
    artifact.features.set_values({feature_score: 8})
    assert artifact.features.get_values() == {"artifact_feature_object_score": 8}
    artifact.features.add_values({feature_tag: "keep"})
    assert artifact.features.get_values() == {
        "artifact_feature_object_score": 8,
        "artifact_feature_object_tag": "keep",
    }
    # remove_values supports dictionary inputs with Feature keys.
    artifact.features.remove_values({feature_score: 8, feature_tag: None})
    assert artifact.features.get_values() == {}
    artifact.delete(permanent=True)
    feature_score.delete(permanent=True)
    feature_tag.delete(permanent=True)


================================================
FILE: tests/core/test_artifact_parquet.py
================================================
import lamindb as ln
import pandas as pd
import pyarrow.parquet as pq


def test_parquet_kwargs():
    df = pd.DataFrame(
        {
            "a": [3, 1, 4, 2],
            "b": ["c", "a", "d", "b"],
            "c": [3.3, 1.1, 4.4, 2.2],
        }
    )
    df_sorted = df.sort_values(by=["a", "b"])
    sorting_columns = [
        pq.SortingColumn(0, descending=False, nulls_first=False),
        pq.SortingColumn(1, descending=False, nulls_first=False),
    ]
    artifact = ln.Artifact.from_dataframe(
        df_sorted,
        key="df_sorted.parquet",
        parquet_kwargs={"sorting_columns": sorting_columns},
    ).save()
    pyarrow_dataset = artifact.open()
    fragment = next(pyarrow_dataset.get_fragments())
    assert list(fragment.metadata.row_group(0).sorting_columns) == sorting_columns


================================================
FILE: tests/core/test_blocks.py
================================================
import lamindb as ln
import pytest


def test_block_recovery_based_on_hash():
    block1 = ln.models.Block(key="__lamindb_block__", content="1", kind="readme").save()
    block2 = ln.models.Block(key="__lamindb_block__", content="1", kind="readme")
    assert block1 == block2
    block1.delete()
    block2 = ln.models.Block(key="__lamindb_block__", content="1", kind="readme")
    assert block1 != block2
    block1.delete(permanent=True)


def test_block_recovery_based_on_key():
    block1 = ln.models.Block(key="__lamindb_block__", kind="readme").save()
    block2 = ln.models.Block(key="__lamindb_block__", kind="readme")
    assert block1 == block2
    block1.delete()
    block2 = ln.models.Block(key="__lamindb_block__", kind="readme")
    assert block1 != block2
    block1.delete(permanent=True)


def test_readme_md_key_is_allowed_and_revises():
    block1 = ln.models.Block(
        key="README.md", content="# v1\n\nhello", kind="readme"
    ).save()
    block2 = ln.models.Block(key="README.md", content="# v2\n\nhello", kind="readme")
    assert block2.stem_uid == block1.stem_uid
    assert block2.uid != block1.uid
    block2.save()
    block1.refresh_from_db()
    assert not block1.is_latest
    block2.delete()
    block1.delete()


def test_revise_blocks():
    # attempt to create a block with an invalid version
    with pytest.raises(ValueError) as error:
        ln.models.Block(key="__lamindb_block__", version=0, kind="readme")
    assert "version" in error.exconly() or "version_tag" in error.exconly()

    # create a versioned block
    block = ln.models.Block(key="__lamindb_block__", version="1", kind="readme")
    assert block.version_tag == "1"
    assert block.version == "1"
    assert len(block.uid) == ln.models.Block._len_full_uid == 20
    assert len(block.stem_uid) == ln.models.Block._len_stem_uid == 16
    block.save()

    # try to reload the same block with the same uid
    block_reload = ln.models.Block(
        uid=block.uid, key="__lamindb_artifact__", kind="readme"
    )
    assert block_reload.id == block.id
    assert block_reload.key == "__lamindb_block__"  # unchanged, prints logging

    # create new block from old block
    block_r2 = ln.models.Block(content="v2", revises=block, kind="readme")
    assert block_r2.uid != block.uid
    assert block_r2.uid.endswith("0001")
    block_r2 = ln.models.Block(content="v2", revises=block, kind="readme")
    assert block_r2.uid != block.uid
    assert block_r2.uid.endswith("0001")
    assert block_r2.stem_uid == block.stem_uid
    assert block_r2.version_tag is None
    assert block_r2.version == block_r2.uid[-4:]
    assert block_r2.is_latest
    assert block.is_latest
    block_r2.save()
    assert not block.is_latest

    # create new block from newly versioned block
    block_r3 = ln.models.Block(
        content="v3", revises=block_r2, version="2", kind="readme"
    )
    assert block_r3.stem_uid == block.stem_uid
    assert block_r3.version_tag == "2"
    assert block_r3.version == "2"

    # revise by matching on key
    key = "__lamindb_artifact__"
    block_r2.key = key
    block_r2.save()
    assert block_r2.is_latest
    block_r3 = ln.models.Block(content="v3", key=key, version="2", kind="readme")
    assert block_r3.uid[:-4] == block_r2.uid[:-4]
    assert block_r3.uid != block_r2.uid  # new version after block_r2
    block_r2.content = "something else"
    block_r2.save()
    block_r3 = ln.models.Block(content="v3", key=key, version="2", kind="readme")
    assert block_r3.uid[:-4] == block_r2.uid[:-4]
    assert block_r3.uid != block_r2.uid  # yet another new version
    assert block_r3.stem_uid == block_r2.stem_uid
    assert block_r3.key == key
    assert block_r3.version_tag == "2"
    assert block_r3.version == "2"
    assert block_r3.is_latest
    assert block_r2.is_latest
    assert block_r3._revises is not None
    block_r3.save()
    block_r2 = ln.models.Block.get(block_r2.uid)
    assert not block_r2.is_latest

    # wrong block type
    with pytest.raises(TypeError) as error:
        ln.models.Block(
            key="__lamindb_block__", revises=ln.Record(name="x"), kind="readme"
        )
    assert error.exconly().startswith("TypeError: `revises` has to be of type `Block`")

    # wrong kwargs
    with pytest.raises(ValueError) as error:
        ln.models.Block(key="__lamindb_block__", x=1, kind="readme")
    assert "can be passed" in error.exconly() and "x" in error.exconly()

    # kind required (Block only supports kind="readme")
    with pytest.raises(ValueError) as error:
        ln.models.Block(key="__lamindb_block__", content="y")
    assert "kind" in error.exconly() and "readme" in error.exconly()

    # invalid kind (Block only supports readme)
    with pytest.raises(ValueError) as error:
        ln.models.Block(key="__lamindb_block__", content="y", kind="comment")
    assert "readme" in error.exconly() or "Only kind" in error.exconly()

    # cleanup
    block_r2.delete()
    block.delete()

    # unversioned block
    block = ln.models.Block(key="__lamindb_block__", kind="readme")
    assert block.version_tag is None
    assert block.version == block.uid[-4:]
    block.save()

    # create new block from old block
    new_block = ln.models.Block(content="new", revises=block, kind="readme")
    assert block.version_tag is None
    assert block.version == block.uid[-4:]
    assert new_block.stem_uid == block.stem_uid
    assert new_block.uid.endswith("0001")
    assert new_block.version_tag is None
    assert new_block.version == new_block.uid[-4:]
    block.delete(permanent=True)


def test_record_block_readme_always_new_version():
    """Readme always creates a new version (no content-hash dedup)."""
    record = ln.Record(name="test-record-blocks").save()
    block1 = ln.models.RecordBlock(record=record, content="1", kind="readme").save()
    block2 = ln.models.RecordBlock(record=record, content="1", kind="readme")
    assert block1.stem_uid == block2.stem_uid
    assert block1.uid != block2.uid  # new version each time
    block1.delete()  # BaseSQLRecord has no soft delete; this is permanent
    block2 = ln.models.RecordBlock(record=record, content="1", kind="readme")
    assert block1 != block2  # block2 is a new block (block1 was removed)
    record.delete(permanent=True)


def test_record_block_comment_always_new_block():
    """Comment always creates a new block (no versioning; revises not allowed)."""
    record = ln.Record(name="test-record-blocks-comment").save()
    # Add readme and comments to test full describe
    ln.models.RecordBlock(
        record=record, content="# Overview\n\nTest readme.", kind="readme"
    ).save()
    # Comments never version: each creation is a new comment (new uid).
    comment1 = ln.models.RecordBlock(
        record=record, content="same text", kind="comment"
    ).save()
    comment2 = ln.models.RecordBlock(record=record, content="same text", kind="comment")
    assert comment1.stem_uid != comment2.stem_uid  # always new comment, no dedup
    # revises is not allowed for kind='comment'
    with pytest.raises(ValueError) as error:
        ln.models.RecordBlock(
            record=record, content="a comment", kind="comment", revises=comment1
        )
    assert "revises is not allowed for kind='comment'" in error.exconly()
    # Test full describe call with include="comments"
    result = record.describe(return_str=True, include="comments")
    assert "README" in result
    assert "comment by" in result
    assert "same text" in result
    comment1.delete()
    record.delete(permanent=True)


def test_record_block_recovery_based_on_record_and_kind():
    record = ln.Record(name="test-record-blocks-key").save()
    block1 = ln.models.RecordBlock(record=record, kind="readme").save()
    block2 = ln.models.RecordBlock(record=record, kind="readme")
    assert block1 == block2
    block1.delete()  # BaseSQLRecord has no soft delete; this is permanent
    block2 = ln.models.RecordBlock(record=record, kind="readme")
    assert block1 != block2  # block2 is a new block (block1 was removed)
    record.delete(permanent=True)


def test_revise_record_blocks():
    record = ln.Record(name="test-record-revise").save()
    # create a versioned record block
    block = ln.models.RecordBlock(
        record=record, content="v1", kind="readme", version="1"
    )
    assert block.version_tag == "1"
    assert block.version == "1"
    assert len(block.uid) == ln.models.RecordBlock._len_full_uid == 20
    assert len(block.stem_uid) == ln.models.RecordBlock._len_stem_uid == 16
    block.save()
    # reload same block by uid
    block_reload = ln.models.RecordBlock(record=record, uid=block.uid, kind="readme")
    assert block_reload.id == block.id
    # create new block from old block
    block_r2 = ln.models.RecordBlock(
        record=record, content="v2", kind="readme", revises=block
    )
    assert block_r2.uid != block.uid
    assert block_r2.uid.endswith("0001")
    assert block_r2.stem_uid == block.stem_uid
    assert block_r2.is_latest
    assert block.is_latest
    block_r2.save()
    assert not block.is_latest
    # create new block from newly versioned block
    block_r3 = ln.models.RecordBlock(
        record=record, content="v3", kind="readme", revises=block_r2, version="2"
    )
    assert block_r3.stem_uid == block.stem_uid
    assert block_r3.version_tag == "2"
    assert block_r3.version == "2"
    # readme always creates a new version (no hash-based dedup)
    block_r3.save()
    # so next readme for this record gets revises=block_r3
    block_same = ln.models.RecordBlock(record=record, content="v3", kind="readme")
    assert block_same.stem_uid == block_r3.stem_uid
    assert block_same.uid != block_r3.uid  # new version (0003)
    # comment does not accept revises
    with pytest.raises(ValueError) as error:
        ln.models.RecordBlock(
            record=record, content="a comment", kind="comment", revises=block
        )
    assert "revises is not allowed for kind='comment'" in error.exconly()
    # wrong kwargs
    with pytest.raises(ValueError) as error:
        ln.models.RecordBlock(record=record, x=1)
    assert "can be passed" in error.exconly()
    # record required
    with pytest.raises(ValueError) as error:
        ln.models.RecordBlock(content="x", kind="readme")
    assert "record is required" in error.exconly()
    block_r2.delete()
    block.delete()
    record.delete(permanent=True)


def test_record_block_filter_respects_default_branch_scope():
    main_branch = ln.Branch.get(name="main")
    ln.setup.switch(main_branch.name)
    main_record = ln.Record(name="record-block-main").save()
    ln.models.RecordBlock(
        record=main_record,
        content="record-block-main-content",
        kind="readme",
        branch=main_branch,
        created_on=main_branch,
    ).save()
    contrib = ln.Branch(name="record_block_scope_branch").save()
    ln.setup.switch(contrib.name)
    contrib_record = ln.Record(name="record-block-contrib").save()
    contrib_block = ln.models.RecordBlock(
        record=contrib_record,
        content="record-block-contrib-content",
        kind="readme",
        branch=contrib,
        created_on=contrib,
    ).save()
    assert (
        ln.models.RecordBlock.filter(content="record-block-contrib-content").count()
        == 1
    )
    ln.setup.switch(main_branch.name)
    assert (
        ln.models.RecordBlock.filter(content="record-block-contrib-content").count()
        == 0
    )
    contrib_block.delete()
    contrib_record.delete(permanent=True)
    main_record.delete(permanent=True)
    contrib.delete(permanent=True)


================================================
FILE: tests/core/test_branches.py
================================================
import lamindb as ln


def testbranch_id():
    # create a file with the default branch_id
    with open("./testbranch_id.txt", "w") as f:
        f.write("branch_id")
    artifact = ln.Artifact("./testbranch_id.txt", description="testbranch_id").save()
    assert artifact.branch_id == 1
    # create a collection from the artifact
    collection = ln.Collection(artifact, key="testbranch_id").save()
    # deleting a collection puts the collection, but not the linked artifact, in trash
    collection.delete()
    assert collection.ordered_artifacts[0].branch_id == 1
    result = ln.Collection.filter(key="testbranch_id")
    assert len(result) == 0
    result = ln.Collection.filter(key="testbranch_id", branch_id=1)
    assert len(result) == 0
    result = ln.Collection.filter(key="testbranch_id", branch_id=None)
    assert len(result) == 1
    # restore
    collection.restore()
    assert collection.branch_id == 1
    assert collection.ordered_artifacts[0].branch_id == 1
    # permanent delete
    collection.delete(permanent=True)
    # check the linked artifact after permanently deleting the collection
    result = ln.Artifact.filter(description="testbranch_id", branch_id=None)
    assert len(result) == 1


================================================
FILE: tests/core/test_can_curate.py
================================================
import bionty as bt
import lamindb as ln
import pytest
from lamindb.errors import ValidationError

# some validate tests are in test_queryset


def test_inspect():
    ln.Schema.filter().delete(permanent=True)
    bt.Gene.filter().delete(permanent=True)
    result = bt.Gene.inspect("TCF7", "symbol", organism="human")
    assert result.validated == []
    bt.Gene.from_source(symbol="TCF7", organism="human").save()
    result = bt.Gene.inspect("TCF7", organism="human")
    assert bt.Gene.validate("TCF7", organism="human")
    result = bt.Gene.inspect(["TCF7", "ABC1"], "symbol", organism="human")
    assert result.validated == ["TCF7"]
    # clean up
    bt.Gene.filter().delete(permanent=True)


# if a record was added to the DB via a different source
# it will still be validated because it's in the DB
def test_inspect_source():
    source1 = bt.Source.get(entity="bionty.CellType", name="cl")
    source2 = bt.CellType.add_source(source="cl", version="2022-08-16")
    bt.CellType.from_source(name="T cell", source=source1).save()
    assert bt.CellType.inspect("T-cell", source=source2, mute=True).synonyms_mapper == {
        "T-cell": "T cell"
    }
    assert (
        bt.CellType.inspect(
            "T-cell", source=source2, mute=True, strict_source=True
        ).synonyms_mapper
        == {}
    )
    assert bt.CellType.validate("T cell", source=source2, mute=True).sum() == 1
    assert (
        bt.CellType.validate(
            "T cell", source=source2, mute=True, strict_source=True
        ).sum()
        == 0
    )
    assert bt.CellType.standardize("T-cell",
source=source2, mute=True) == "T cell" # here still standardized because of bionty assert ( bt.CellType.standardize("T-cell", source=source2, mute=True, strict_source=True) == "T cell" ) bt.CellType.filter().delete(permanent=True) def test_standardize(): # synonym not in the database result = bt.Gene.standardize(["ABC1", "PDCD1"], organism="human") assert result == ["HEATR6", "PDCD1"] result = bt.Gene.standardize( ["ABC1", "PDCD1"], field=bt.Gene.symbol, organism="human" ) assert result == ["HEATR6", "PDCD1"] mapper = bt.Gene.standardize( ["ABC1", "PDCD1"], return_mapper=True, organism="human" ) assert mapper == {"ABC1": "HEATR6"} # synonym already in the database bt.Gene.from_source(symbol="LMNA", organism="human").save() mapper = bt.Gene.standardize(["ABC1", "LMN1"], return_mapper=True, organism="human") assert mapper == {"LMN1": "LMNA", "ABC1": "HEATR6"} assert bt.Gene.standardize(["LMNA"], organism="human") == ["LMNA"] assert bt.Gene.standardize("LMNA", organism="human") == "LMNA" assert bt.Gene.standardize(["LMN1"], return_mapper=True, organism="human") == { "LMN1": "LMNA" } def test_standardize_from_source(): result = bt.Gene.standardize(["ABC1", "PDCD1"], from_source=False) assert result == ["ABC1", "PDCD1"] def test_add_remove_synonym(): bt.CellType.filter().delete(permanent=True) # a registry that doesn't have a synonyms column user = ln.User.get(handle=ln.setup.settings.user.handle) with pytest.raises(NotImplementedError): user.add_synonym("syn") cell_types = bt.CellType.from_values(["T cell", "B cell"], "name") ln.save(cell_types) tcell = bt.CellType.get(name="T cell") bcell = bt.CellType.get(name="B cell") tcell.add_synonym(["my cell type"]) tcell.add_synonym("") tcell.add_synonym([]) assert "my cell type" in tcell.synonyms with pytest.raises(ValidationError): bcell.add_synonym("my cell type") with pytest.raises(ValidationError): tcell.add_synonym("my|celltype") tcell.remove_synonym("my cell type") assert "my cell type" not in tcell.synonyms bcell.synonyms = None bcell.save() tcell.synonyms = None tcell.save() tcell.add_synonym("") tcell.add_synonym([""]) tcell.add_synonym([]) tcell.add_synonym(["my cell type"]) tcell.add_synonym("") tcell.add_synonym([""]) tcell.add_synonym([]) assert tcell.synonyms == "my cell type" tcell.remove_synonym("my cell type") # clean up bt.CellType.filter().delete(permanent=True) def test_set_abbr(): bt.CellType.filter().delete(permanent=True) bt.CellType(name="my cell type").save() record = bt.CellType.get(name="my cell type") # if abbr is name, do not add to synonyms record.set_abbr("my cell type") assert record.abbr == "my cell type" assert record.synonyms is None record.set_abbr("myct") assert record.abbr == "myct" assert "myct" in record.synonyms source = bt.Source.filter(organism="human").first() with pytest.raises(AttributeError) as error: source.set_abbr("abbr") assert ( error.exconly() == "AttributeError: 'Source' object has no attribute 'set_abbr'" ) record.delete() def test_validate_int(): result = ln.User.validate([1, 2, 3], field=ln.User.id) assert result.sum() == 1 def test_synonym_mapping(): # only name field can be standardized bt.Gene.from_source(symbol="TNFRSF4", organism="human").save() result = bt.Gene.inspect( ["CD134", "TNFRSF4"], field=bt.Gene.symbol, organism="human" ) assert result.synonyms_mapper == {"CD134": "TNFRSF4"} result = bt.Gene.inspect( ["CD134", "TNFRSF4"], field=bt.Gene.ensembl_gene_id, organism="human" ) assert result.synonyms_mapper == {} bt.Gene.filter().delete(permanent=True) def 
test_validate_called_on_object_raises_error(): """Calling validate() on an object must raise TypeError.""" label = ln.ULabel(name="test_label").save() with pytest.raises(TypeError) as error: label.validate(["test_value"]) assert ( "ULabel.validate() is a class method and must be called on the ULabel class, not on a ULabel object" in str(error.value) ) def test_standardize_source(): """When passing a specific source to standardize, any matched public records must come from the passed source.""" # 'HANCESTRO:0006' in Hancestro 3.0 but 'HANCESTRO:0848' in later versions assert ( bt.Ethnicity.standardize( ["South Asian"], field="name", return_field="ontology_id", source=bt.Source( entity="bionty.Ethnicity", version="3.0", name="hancestro", organism="human", ), )[0] == "HANCESTRO:0006" ) ================================================ FILE: tests/core/test_collection.py ================================================ import re import anndata as ad import lamindb as ln import numpy as np import pandas as pd import pytest from lamindb.errors import FieldValidationError from scipy.sparse import csc_matrix, csr_matrix @pytest.fixture(scope="module") def df(): return pd.DataFrame({"feat1": [1, 2], "feat2": [3, 4]}) @pytest.fixture(scope="module") def adata(): return ad.AnnData( X=np.array([[1, 2, 3], [4, 5, 6]]), obs={"feat1": ["A", "B"]}, var=pd.DataFrame(index=["MYC", "TCF7", "GATA1"]), obsm={"X_pca": np.array([[1, 2], [3, 4]])}, raw={"X": np.array([[8, 9, 10, 11], [12, 13, 14, 15]])}, ) @pytest.fixture(scope="module") def adata2(): return ad.AnnData( X=np.array([[1, 2, 5], [4, 5, 8]]), obs={"feat1": ["A", "B"]}, var=pd.DataFrame(index=["MYC", "TCF7", "GATA1"]), obsm={"X_pca": np.array([[1, 2], [3, 4]])}, ) def test_from_single_artifact(adata): features = ln.Feature.from_dataframe(adata.obs) validated = ln.Feature.validate( [feature.name for feature in features], field="name" ) ln.save([feature for (feature, valid) in zip(features, validated) if valid]) artifact = ln.Artifact.from_anndata(adata, description="My adata") if not artifact._state.adding: artifact.delete(permanent=True) # make sure we get a fresh one artifact = ln.Artifact.from_anndata(adata, description="My adata") with pytest.raises(ValueError) as error: ln.Collection(artifact, key="Test") assert str(error.exconly()).startswith( "ValueError: Not all artifacts are yet saved, please save them" ) artifact.save() with pytest.raises(ValueError) as error: ln.Collection(artifact, artifact) assert str(error.exconly()).startswith( "ValueError: Only one non-keyword arg allowed: artifacts" ) transform = ln.Transform(key="My test transform").save() run = ln.Run(transform).save() collection = ln.Collection(artifact, key="My new collection", run=run).save() assert collection.run.input_artifacts.get() == artifact collection.delete(permanent=True) artifact.delete(permanent=True) assert ln.Artifact.filter(id=artifact.id).one_or_none() is None def test_edge_cases(df, ccaplog): with pytest.raises( FieldValidationError, match=re.escape( "Only artifacts, key, description, meta, reference, reference_type, run, revises, skip_hash_lookup can be passed" ), ) as error: ln.Collection(df, invalid_param=1) with pytest.raises(ValueError) as error: ln.Collection(1, key="Invalid") assert str(error.exconly()).startswith( "ValueError: Artifact or list[Artifact] is allowed." 
) artifact = ln.Artifact.from_dataframe(df, description="Test artifact") assert artifact._state.adding with pytest.raises(ValueError) as error: ln.Collection([artifact]) assert str(error.exconly()).startswith( "ValueError: Not all artifacts are yet saved, please save them" ) artifact.save() ln.Collection([artifact, artifact], key="test-collection") assert "your collection contains artifacts with non-unique hashes:" in ccaplog.text artifact.delete(permanent=True) def test_from_inconsistent_artifacts(df, adata): artifact1 = ln.Artifact.from_dataframe(df, description="My test").save() artifact2 = ln.Artifact.from_anndata(adata, description="My test2").save() collection = ln.Collection([artifact1, artifact2], key="Inconsistent").save() # test idempotency of .save() collection.save() # create a run context ln.track(transform=ln.Transform(key="My test transform")) # can iterate over them collection.cache() assert set(ln.context.run.input_collections.all()) == {collection} # loading will throw an error here with pytest.raises(ValueError) as error: collection.load() assert str(error.exconly()).startswith( "ValueError: Can only load collections where all artifacts have the same suffix" ) # test through query set with pytest.raises(ValueError) as error: collection.artifacts.all().load() assert str(error.exconly()).startswith( "ValueError: Can only load collections where all artifacts have the same suffix" ) collection.describe() collection.delete(permanent=True) artifact1.delete(permanent=True) artifact2.delete(permanent=True) ln.context._run = None def test_from_consistent_artifacts(adata, adata2): artifact1 = ln.Artifact.from_anndata(adata, key="my_test.h5ad").save() artifact2 = ln.Artifact.from_anndata(adata2, key="my_test.h5ad").save() transform = ln.Transform(key="My test transform").save() run = ln.Run(transform).save() initial_key = "My test" collection = ln.Collection([artifact1, artifact2], key=initial_key, run=run) assert collection._state.adding collection.save() assert set(collection.run.input_artifacts.all()) == {artifact1, artifact2} adata_joined = collection.load() assert "artifact_uid" in adata_joined.obs.columns assert artifact1.uid in adata_joined.obs.artifact_uid.cat.categories # test from query set through collection adata_joined = collection.artifacts.order_by("-created_at").load() assert "artifact_uid" in adata_joined.obs.columns assert artifact1.uid in adata_joined.obs.artifact_uid.cat.categories # re-run with hash-based lookup collection2 = ln.Collection([artifact1, artifact2], key="My test 1", run=run) assert collection2 == collection assert collection2.key == "My test 1" # key is updated # skip hash lookup collection2 = ln.Collection( [artifact1, artifact2], key="My test 1", run=run, skip_hash_lookup=True ) assert collection2 != collection # let hash uniqueness constraint fail and database return the existing record collection2 = ln.Collection( [artifact1, artifact2], key=initial_key, run=run, skip_hash_lookup=True ).save() assert collection2 == collection # move to trash and then re-run collection.delete() collection2 = ln.Collection([artifact1, artifact2], key="My test 2", run=run) assert collection2 != collection assert collection2.key == "My test 2" collection.delete(permanent=True) artifact1.delete(permanent=True) artifact2.delete(permanent=True) def test_mapped(adata, adata2): # prepare test data adata.strings_to_categoricals() adata.obs["feat2"] = adata.obs["feat1"] adata.layers["layer1"] = adata.X.copy() adata.layers["layer1"][0, 0] = 0 artifact1 = 
ln.Artifact.from_anndata(adata, key="part_one.h5ad").save() adata2.X = csr_matrix(adata2.X) adata2.layers["layer1"] = adata2.X.copy() adata2.obs["feat2"] = adata2.obs["feat1"] artifact2 = ln.Artifact.from_anndata( adata2, key="part_two.zarr", format="zarr" ).save() adata3 = adata2.copy() adata3.var_names = ["A", "B", "C"] adata3.obs.loc["0", "feat1"] = np.nan artifact3 = ln.Artifact.from_anndata(adata3, key="other_vars.h5ad").save() adata4 = adata.copy() adata4.layers["layer1"] = csc_matrix(adata4.layers["layer1"]) artifact4 = ln.Artifact.from_anndata(adata4, description="csc layer").save() collection_outer = ln.Collection( [artifact1, artifact2, artifact3], key="gather_outer" ).save() collection_csc = ln.Collection([artifact4, artifact2], key="check_csc").save() collection = ln.Collection([artifact1, artifact2], key="gather") # test mapped without saving first with collection.mapped() as ls_ds: assert ls_ds.__class__.__name__ == "MappedCollection" collection.save() # test encoders with pytest.raises(ValueError): ls_ds = collection.mapped(encode_labels=["feat1"]) with pytest.raises(ValueError): ls_ds = collection.mapped(obs_keys="feat1", encode_labels=["feat3"]) with pytest.raises(ValueError): ls_ds = collection.mapped(obs_keys="feat1", unknown_label={"feat3": "Unknown"}) with collection.mapped(obs_keys=["feat1", "feat2"], unknown_label="A") as ls_ds: assert ls_ds.encoders["feat1"]["A"] == -1 assert ls_ds.encoders["feat1"]["B"] == 0 assert ls_ds.encoders["feat2"]["A"] == -1 assert ls_ds.encoders["feat2"]["B"] == 0 assert ls_ds[0]["feat1"] == -1 assert ls_ds[1]["feat1"] == 0 assert ls_ds[0]["feat2"] == -1 assert ls_ds[1]["feat2"] == 0 with collection.mapped( obs_keys=["feat1", "feat2"], unknown_label={"feat1": "A"} ) as ls_ds: assert ls_ds.encoders["feat1"]["A"] == -1 assert ls_ds.encoders["feat1"]["B"] == 0 # categories in the encoder are sorted A_enc = ls_ds.encoders["feat2"]["A"] assert A_enc == 0 B_enc = ls_ds.encoders["feat2"]["B"] assert B_enc == 1 assert ls_ds[0]["feat1"] == -1 assert ls_ds[1]["feat1"] == 0 assert ls_ds[0]["feat2"] == A_enc assert ls_ds[1]["feat2"] == B_enc with collection.mapped( obs_keys=["feat1", "feat2"], unknown_label="A", encode_labels=["feat1"] ) as ls_ds: assert ls_ds.encoders["feat1"]["A"] == -1 assert ls_ds.encoders["feat1"]["B"] == 0 assert "feat2" not in ls_ds.encoders assert ls_ds[0]["feat1"] == -1 assert ls_ds[1]["feat1"] == 0 assert ls_ds[0]["feat2"] == "A" assert ls_ds[1]["feat2"] == "B" ls_ds = collection.mapped(obs_keys="feat1") assert not ls_ds.closed assert len(ls_ds) == 4 assert len(ls_ds[0]) == 3 and len(ls_ds[2]) == 3 assert len(ls_ds[0]["X"]) == 3 assert np.array_equal(ls_ds[2]["X"], np.array([1, 2, 5])) weights = ls_ds.get_label_weights("feat1") assert len(weights) == 4 assert all(weights == 0.5) weights = ls_ds.get_label_weights(["feat1", "feat2"]) assert len(weights) == 4 assert all(weights == 0.5) weights = ls_ds.get_label_weights(["feat1", "feat2"], scaler=1.0) assert all(weights == 1.0 / 3.0) weights = ls_ds.get_label_weights( ["feat1", "feat2"], scaler=1.0, return_categories=True ) assert weights["A__A"] == 1.0 / 3.0 assert weights["B__B"] == 1.0 / 3.0 assert not ls_ds.check_vars_sorted(ascending=True) assert not ls_ds.check_vars_sorted(ascending=False) assert ls_ds.check_vars_non_aligned(["MYC", "TCF7", "GATA1"]) == [] ls_ds.var_list = None assert not ls_ds.check_vars_sorted() ls_ds.var_list = None assert ls_ds.check_vars_non_aligned(["MYC", "TCF7", "GATA1"]) == [] ls_ds.close() assert ls_ds.closed del ls_ds with 
collection.mapped(obs_keys="feat1", join="inner", dtype="float32") as ls_ds: assert not ls_ds.closed assert len(ls_ds) == 4 assert len(ls_ds[0]) == 3 and len(ls_ds[2]) == 3 assert str(ls_ds[0]["X"].dtype) == "float32" assert str(ls_ds[2]["X"].dtype) == "float32" assert ls_ds.closed ls_ds = collection.mapped(obs_keys="feat1", parallel=True) assert len(ls_ds[0]) == 3 and len(ls_ds[2]) == 3 assert ls_ds[0]["_store_idx"] == 0 assert ls_ds[2]["_store_idx"] == 1 ls_ds = collection.mapped( layers_keys=["layer1"], obsm_keys=["X_pca"], obs_keys="feat1" ) assert np.array_equal(ls_ds[0]["layer1"], np.array([0, 2, 3])) assert np.array_equal(ls_ds[2]["layer1"], np.array([1, 2, 5])) assert np.array_equal(ls_ds[2]["obsm_X_pca"], np.array([1, 2])) assert np.array_equal(ls_ds[3]["obsm_X_pca"], np.array([3, 4])) assert ls_ds.shape == (4, 3) assert ls_ds.original_shapes[0] == (2, 3) and ls_ds.original_shapes[1] == (2, 3) ls_ds.close() # keys not present in a store are ignored (omitted from output) with collection.mapped( obs_keys=["feat1", "feat_missing"], obsm_keys=["X_pca", "X_missing"], layers_keys=["X", "raw.X"], ) as ls_ds: assert len(ls_ds) == 4 ls_ds_idx = ls_ds[0] assert ls_ds_idx["X"].shape == (3,) assert ls_ds_idx["raw.X"].shape == (4,) assert "feat1" in ls_ds_idx assert "feat_missing" not in ls_ds_idx assert "obsm_X_pca" in ls_ds_idx assert "obsm_X_missing" not in ls_ds_idx assert "raw.X" not in ls_ds[2] # test with QuerySet query_set = ln.Artifact.filter(key__in=["part_one.h5ad", "part_two.zarr"]) with query_set.mapped() as ls_ds: assert ls_ds.shape == (4, 3) with query_set.order_by("created_at").mapped(stream=True) as ls_ds: assert ls_ds.shape == (4, 3) with collection.mapped(obs_keys="feat1", stream=True) as ls_ds: assert len(ls_ds[0]) == 3 and len(ls_ds[2]) == 3 with pytest.raises(ValueError): with collection_outer.mapped(obs_keys="feat1", join="inner"): pass with collection_outer.mapped( layers_keys="X", obsm_keys="X_pca", obs_keys="feat1", join="outer" ) as ls_ds: assert ls_ds.shape == (6, 6) assert ls_ds.join_vars == "outer" assert len(ls_ds.var_joint) == 6 assert len(ls_ds[0]) == 4 assert len(ls_ds[0]["X"]) == 6 assert np.array_equal(ls_ds[0]["X"], np.array([0, 0, 0, 3, 1, 2])) assert np.array_equal(ls_ds[1]["X"], np.array([0, 0, 0, 6, 4, 5])) assert np.array_equal(ls_ds[2]["X"], np.array([0, 0, 0, 5, 1, 2])) assert np.array_equal(ls_ds[3]["X"], np.array([0, 0, 0, 8, 4, 5])) ls_ds_idx = ls_ds[4] assert np.array_equal(ls_ds_idx["X"], np.array([1, 2, 5, 0, 0, 0])) assert ls_ds_idx["feat1"] is np.nan assert np.array_equal(ls_ds[5]["X"], np.array([4, 5, 8, 0, 0, 0])) assert np.issubdtype(ls_ds[2]["X"].dtype, np.integer) assert np.issubdtype(ls_ds[4]["X"].dtype, np.integer) assert np.array_equal(ls_ds[3]["obsm_X_pca"], np.array([3, 4])) assert ls_ds.check_vars_non_aligned(["MYC", "TCF7", "GATA1"]) == [2] assert not ls_ds.check_vars_sorted() assert len(ls_ds.get_label_weights("feat1")) == 6 with collection_outer.mapped(layers_keys="layer1", join="outer") as ls_ds: assert np.array_equal(ls_ds[0]["layer1"], np.array([0, 0, 0, 3, 0, 2])) assert np.array_equal(ls_ds[4]["layer1"], np.array([1, 2, 5, 0, 0, 0])) # csc matrix in layers with pytest.raises(ValueError): collection_csc.mapped(layers_keys="layer1") # test with obs_filter # tuple as obs_filter is deprecated, test anyways for now with collection.mapped(obs_filter=("feat1", ("A", "B"))) as ls_ds: assert ls_ds.shape == (4, 3) assert np.array_equal(ls_ds[1]["X"], np.array([4, 5, 6])) assert np.array_equal(ls_ds[3]["X"], np.array([4, 5, 8])) 
weights = ls_ds.get_label_weights("feat1") assert len(weights) == 4 assert all(weights == 0.5) # tuple as obs_filter is deprecated, test anyways for now with collection.mapped(obs_filter=("feat1", "B")) as ls_ds: assert ls_ds.shape == (2, 3) assert np.array_equal(ls_ds[0]["X"], np.array([4, 5, 6])) assert np.array_equal(ls_ds[1]["X"], np.array([4, 5, 8])) weights = ls_ds.get_label_weights("feat2") assert len(weights) == 2 assert all(weights == 0.5) with collection.mapped(obs_filter={"feat1": "B", "feat2": ("A", "B")}) as ls_ds: assert ls_ds.shape == (2, 3) assert ls_ds.original_shapes == [(1, 3), (1, 3)] assert np.array_equal(ls_ds[0]["X"], np.array([4, 5, 6])) assert np.array_equal(ls_ds[1]["X"], np.array([4, 5, 8])) weights = ls_ds.get_label_weights("feat2") assert len(weights) == 2 assert all(weights == 0.5) # nan in filtering values with collection_outer.mapped(obs_filter={"feat1": np.nan}, join="outer") as ls_ds: assert ls_ds.shape == (1, 6) assert np.array_equal(ls_ds[0]["X"], np.array([1, 2, 5, 0, 0, 0])) with collection_outer.mapped( obs_filter={"feat1": (np.nan,), "feat2": ["A", "B"]}, join="outer" ) as ls_ds: assert ls_ds.shape == (1, 6) with collection_outer.mapped( obs_filter={"feat1": (np.nan, "A", "B")}, join="outer" ) as ls_ds: assert ls_ds.shape == (6, 6) with collection_outer.mapped( obs_filter={"feat1": ["A", "B"]}, join="outer" ) as ls_ds: assert ls_ds.shape == (5, 6) with collection_outer.mapped( obs_filter={"feat1": ("A", np.nan)}, join="outer" ) as ls_ds: assert ls_ds.shape == (3, 6) collection.delete(permanent=True) collection_outer.delete(permanent=True) collection_csc.delete(permanent=True) artifact1.delete(permanent=True) artifact2.delete(permanent=True) artifact3.delete(permanent=True) artifact4.delete(permanent=True) def test_revise_collection(df, adata): # create a versioned collection artifact = ln.Artifact.from_dataframe(df, description="test").save() collection = ln.Collection(artifact, key="test-collection", version="1") assert collection.version_tag == "1" assert collection.version == "1" assert collection.uid.endswith("0000") collection.save() artifact = ln.Artifact.from_anndata(adata, description="test").save() with pytest.raises(ValueError) as error: collection_r2 = ln.Collection(artifact, revises=collection, version="1") assert ( error.exconly() == "ValueError: Please change the version tag or leave it `None`, '1' is already taken" ) with pytest.raises(TypeError): ln.Collection(adata, revises="wrong-type") # create new collection from old collection collection_r2 = ln.Collection(artifact, key="test-collection") assert collection_r2.stem_uid == collection.stem_uid assert collection_r2.uid.endswith("0001") # repeat collection_r2 = ln.Collection(artifact, key="test-collection") assert collection_r2.stem_uid == collection.stem_uid assert collection_r2.uid.endswith("0001") assert collection_r2.version_tag is None assert ( collection_r2.version == collection_r2.uid[-4:] ) # version falls back to uid suffix assert collection_r2.key == "test-collection" collection_r2.save() # create new collection from newly versioned collection df.iloc[0, 0] = 0 artifact = ln.Artifact.from_dataframe(df, description="test") artifact.save() collection_r3 = ln.Collection( artifact, key="test-collection", description="test description3", version="2", ) assert collection_r3.stem_uid == collection.stem_uid assert collection_r3.version_tag == "2" assert collection_r3.version == "2" assert collection_r3.uid.endswith("0002") assert collection_r3.key == "test-collection" assert 
collection_r3.description == "test description3" artifacts_r2 = collection_r2.artifacts.all() collection_r2.delete(permanent=True) artifacts_r2.delete(permanent=True) artifacts = collection.artifacts.all() collection.delete(permanent=True) artifacts.delete(permanent=True) def test_collection_append(df, adata): artifact = ln.Artifact.from_dataframe(df, description="test").save() artifact_1 = ln.Artifact.from_anndata(adata, description="test").save() collection = ln.Collection(artifact, key="Test", description="Test append").save() new_collection = collection.append(artifact_1).save() assert new_collection.key == collection.key assert new_collection.description == collection.description assert new_collection.uid.endswith("0001") artifacts = new_collection.artifacts.all() assert len(artifacts) == 2 new_collection.versions.delete(permanent=True) artifacts.delete(permanent=True) def test_with_metadata(df, adata): meta_artifact = ln.Artifact.from_dataframe(df, description="test") meta_artifact.save() data_artifact = ln.Artifact.from_anndata(adata, description="test adata") data_artifact.save() collection = ln.Collection( data_artifact, key="test collection", meta_artifact=meta_artifact ) collection.save() assert collection.meta_artifact == meta_artifact assert collection.data_artifact == data_artifact collection.delete(permanent=True) data_artifact.delete(permanent=True) meta_artifact.delete(permanent=True) def test_collection_get_tracking(df): artifact = ln.Artifact.from_dataframe(df, key="df.parquet").save() collection = ln.Collection(artifact, key="track-collection").save() transform = ln.Transform(key="test track collection via get").save() run = ln.Run(transform).save() assert ( ln.Collection.get(key="track-collection", is_run_input=run) in run.input_collections.all() ) collection.delete(permanent=True) artifact.delete(permanent=True) transform.delete(permanent=True) def test_describe_collection(adata, capsys): artifact = ln.Artifact(adata, description="test").save() collection = ln.Collection(artifact, key="test").save() collection.describe() captured = capsys.readouterr() assert len(captured.out) > 50 assert "collection" in captured.out.lower() # test describing from a remote postgres instance with less modules collection = ln.Collection.connect("laminlabs/lamin-dev").first() collection.describe() captured = capsys.readouterr() assert len(captured.out) > 50 assert "collection" in captured.out.lower() ================================================ FILE: tests/core/test_curator_basics.py ================================================ import re import textwrap import bionty as bt import lamindb as ln import pandas as pd import pytest from lamindb.core.exceptions import ValidationError def _strip_ansi(text: str) -> str: """Remove ANSI escape sequences from a string.""" ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") return ansi_escape.sub("", text) @pytest.fixture def df() -> pd.DataFrame: return pd.DataFrame( { "sample_id": ["sample1", "sample2"], "sample_name": ["Sample 1", "Sample 2"], "sample_type": ["Type A", "Type B"], } ) @pytest.fixture def df_missing_sample_type_column() -> pd.DataFrame: return pd.DataFrame( { "sample_id": ["sample1", "sample2"], "sample_name": ["Sample 1", "Sample 2"], } ) @pytest.fixture def df_missing_sample_name_column() -> pd.DataFrame: return pd.DataFrame( { "sample_id": ["sample1", "sample2"], "sample_type": ["Type A", "Type B"], } ) @pytest.fixture def df_changed_col_order() -> pd.DataFrame: return pd.DataFrame( { "sample_name": 
["Sample 1", "Sample 2"], "sample_type": ["Type A", "Type B"], "sample_id": ["sample1", "sample2"], } ) @pytest.fixture def df_extra_column() -> pd.DataFrame: return pd.DataFrame( { "sample_id": ["sample1", "sample2"], "sample_name": ["Sample 1", "Sample 2"], "sample_type": ["Type A", "Type B"], "extra_column": ["Extra 1", "Extra 2"], } ) @pytest.fixture def df_disease() -> pd.DataFrame: return pd.DataFrame( { "disease": pd.Categorical( [ # Only after 2025 mondo "HDAC4-related haploinsufficiency syndrome", "SAMD9L-related spectrum and myeloid neoplasm risk", # Already before 2025 mondo "essential hypertension", "essential hypertension", "asthma", ] ), } ) @pytest.fixture def disease_ontology_old() -> bt.Source: return bt.Disease.add_source( bt.Source.connect("laminlabs/bionty-assets") .get(entity="bionty.Disease", version="2024-08-06", organism="all") .save() ) @pytest.fixture(scope="module") def lists_df(): return pd.DataFrame( { "sample_id": [["sample1", "sample2"], ["sample2"], ["sample3"]], "dose": [[1.2, 2.3], [1.2], [2.3]], "cell_type": [["B cell", "T cell"], ["B cell"], ["T cell"]], "tissue": [["blood", "pulmo"], ["blood"], ["lung"]], } ) @pytest.fixture(scope="module") def cat_df(): return pd.DataFrame( { "sample_id": [["sample1", "sample2"], ["sample2"], ["sample3"]], "dose": [[1.2, 2.3], [1.2], [2.3]], "cell_type": [["B cell", "T cell"], ["B cell"], ["T cell"]], "tissue": ["blood", "blood", "lung"], } ) def test_curator_df_multivalue(lists_df, cat_df): feature1 = ln.Feature(name="sample_id", dtype=list[str]).save() feature2 = ln.Feature(name="dose", dtype=list[float]).save() feature3 = ln.Feature(name="cell_type", dtype=list[str]).save() feature4 = ln.Feature(name="tissue", dtype=list[bt.Tissue]).save() schema = ln.Schema( name="lists schema cat", features=[ feature1, feature2, feature3, feature4, ], ).save() curator = ln.curators.DataFrameCurator(lists_df, schema) with pytest.raises(ValidationError): curator.validate() assert curator.cat._cat_vectors.keys() == {"columns", "tissue"} assert curator.cat._cat_vectors["tissue"]._validated == ["blood", "lung"] assert curator.cat._cat_vectors["tissue"]._non_validated == ["pulmo"] assert curator.cat._cat_vectors["tissue"]._synonyms == {"pulmo": "lung"} curator.cat.standardize("tissue") assert curator.cat._cat_vectors["tissue"]._non_validated == [] assert lists_df["tissue"].tolist() == [["blood", "lung"], ["blood"], ["lung"]] assert curator.validate() is None # test with cat_df which has a non-list tissue curator = ln.curators.DataFrameCurator(cat_df, schema) with pytest.raises(ValidationError): curator.validate() schema.delete(permanent=True) feature1.delete(permanent=True) feature2.delete(permanent=True) feature3.delete(permanent=True) feature4.delete(permanent=True) def test_curators_list_feature_nullable_empty_list(): """Test that a list feature that is nullable can accept empty lists.""" feature_list = ln.Feature( name="list_tissue", dtype=list[bt.Tissue.ontology_id], nullable=True ).save() feature_int = ln.Feature(name="feature int", dtype=int, nullable=True).save() schema = ln.Schema( name="test_list_feature_schema", features=[feature_list, feature_int], coerce=True, ).save() df = pd.DataFrame({"list_tissue": [], "feature int": []}) ln.curators.DataFrameCurator(df, schema).validate() # clean up schema.delete(permanent=True) feature_list.delete(permanent=True) feature_int.delete(permanent=True) def test_curator__repr__(df): feature = ln.Feature(name="sample_id", dtype="str").save() schema = ln.Schema( name="sample schema", 
features=[feature], ).save() curator = ln.curators.DataFrameCurator(df, schema) expected_repr = textwrap.dedent("""\ DataFrameCurator(Schema: sample schema, unvalidated) """).strip() actual_repr = _strip_ansi(repr(curator)) print(actual_repr) assert actual_repr.strip() == expected_repr.strip() schema.delete(permanent=True) feature.delete(permanent=True) @pytest.mark.parametrize( "model_class", [ln.ULabel, ln.Record], ) def test_df_curator_typed_categorical(model_class): # root level sample_root_type = model_class(name="Sample", is_type=True).save() for name in ["s1", "s2"]: model_class(name=name, type=sample_root_type).save() # lab A level lab_a_type = model_class(name="LabA", is_type=True).save() sample_a_type = model_class(name="Sample", is_type=True, type=lab_a_type).save() for name in ["s3", "s4"]: model_class(name=name, type=sample_a_type).save() # lab B level lab_b_type = model_class(name="LabB", is_type=True).save() sample_b_type = model_class(name="Sample", is_type=True, type=lab_b_type).save() for name in ["s5", "s6"]: model_class(name=name, type=sample_b_type).save() df = pd.DataFrame( { "biosample_name": pd.Categorical(["s1", "s2", "s3", "s4", "s5", "s6"]), } ) feature = ln.Feature(name="biosample_name", dtype=sample_a_type).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert "4 terms not validated in feature 'biosample_name':" in error.exconly() assert set(curator.cat._cat_vectors["biosample_name"]._validated) == { "s3", "s4", } assert set(curator.cat._cat_vectors["biosample_name"]._non_validated) == { "s1", "s2", "s5", "s6", } # Move LabB under LabA lab_b_type.type = lab_a_type lab_b_type.save() feature.delete(permanent=True) # re-create the feature with the new dtype feature = ln.Feature(name="biosample_name", dtype=lab_a_type).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert set(curator.cat._cat_vectors["biosample_name"]._validated) == { "s3", "s4", "s5", "s6", } assert set(curator.cat._cat_vectors["biosample_name"]._non_validated) == { "s1", "s2", } # Lab at the root feature.delete(permanent=True) # re-create the feature with the new dtype feature = ln.Feature(name="biosample_name", dtype=sample_root_type).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert set(curator.cat._cat_vectors["biosample_name"]._validated) == { "s1", "s2", } assert set(curator.cat._cat_vectors["biosample_name"]._non_validated) == { "s3", "s4", "s5", "s6", } attribute = model_class.__name__.lower() + "s" getattr(sample_a_type, attribute).all().delete(permanent=True) getattr(sample_b_type, attribute).all().delete(permanent=True) getattr(lab_b_type, attribute).all().delete(permanent=True) getattr(lab_a_type, attribute).all().delete(permanent=True) lab_a_type.delete(permanent=True) lab_b_type.delete(permanent=True) getattr(sample_root_type, attribute).all().delete(permanent=True) sample_root_type.delete(permanent=True) feature.delete(permanent=True) def test_df_curator_same_name_at_different_levels_involving_root(): s1_root = ln.Record(name="s1").save() lab_a_type = ln.Record(name="LabA", is_type=True).save() s1_lab_a = ln.Record(name="s1", type=lab_a_type).save() df = pd.DataFrame({"biosample_name": pd.Categorical(["s1"])}) # feature constraining to 
lab_a_type feature = ln.Feature(name="biosample_name", dtype=lab_a_type).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) curator.validate() cat_vector = curator._atomic_curator.cat._cat_vectors["biosample_name"] assert cat_vector._validated == ["s1"] assert len(cat_vector.records) == 1 assert cat_vector.records[0] == s1_lab_a # feature constraining to root feature.delete(permanent=True) feature = ln.Feature(name="biosample_name", dtype=ln.Record).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) curator.validate() cat_vector = curator._atomic_curator.cat._cat_vectors["biosample_name"] assert cat_vector._validated == ["s1"] assert len(cat_vector.records) == 1 assert cat_vector.records[0] == s1_root feature.delete(permanent=True) s1_root.delete(permanent=True) s1_lab_a.delete(permanent=True) lab_a_type.delete(permanent=True) def test_df_curator_same_name_at_different_levels_below_root(): department_a_type = ln.Record(name="DepartmentA", is_type=True).save() s1_department_a = ln.Record(name="s1", type=department_a_type).save() lab_a_type = ln.Record(name="LabA", is_type=True, type=department_a_type).save() s1_lab_a = ln.Record(name="s1", type=lab_a_type).save() df = pd.DataFrame({"biosample_name": pd.Categorical(["s1"])}) # feature constraining to lab_a_type feature = ln.Feature(name="biosample_name", dtype=lab_a_type).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) curator.validate() cat_vector = curator._atomic_curator.cat._cat_vectors["biosample_name"] assert cat_vector._validated == ["s1"] assert len(cat_vector.records) == 1 assert cat_vector.records[0] == s1_lab_a # feature constraining to department_a_type feature.delete(permanent=True) feature = ln.Feature(name="biosample_name", dtype=department_a_type).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) curator.validate() cat_vector = curator._atomic_curator.cat._cat_vectors["biosample_name"] assert cat_vector._validated == ["s1"] assert len(cat_vector.records) == 1 assert cat_vector.records[0] == s1_department_a feature.delete(permanent=True) s1_department_a.delete(permanent=True) s1_lab_a.delete(permanent=True) lab_a_type.delete(permanent=True) department_a_type.delete(permanent=True) def test_df_curator_same_name_at_same_level(): # below root level lab_a_type = ln.Record(name="LabA", is_type=True).save() record_1 = ln.Record(name="s1", type=lab_a_type).save() lab_b_type = ln.Record(name="LabB", is_type=True).save() record_2 = ln.Record(name="s1", type=lab_b_type).save() df = pd.DataFrame({"biosample_name": pd.Categorical(["s1"])}) feature = ln.Feature(name="biosample_name", dtype=ln.Record).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert ( "Ambiguous match for Record 's1': found 2 records at depth 1 (under types: ['LabA', 'LabB'])" in error.exconly() ) # at root level record_1.type = None record_1.save() record_2.type = None record_2.save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert ( "Ambiguous match for Record 's1': found 2 root-level records" in error.exconly() ) feature.delete(permanent=True) record_1.delete(permanent=True) lab_a_type.delete(permanent=True) record_2.delete(permanent=True) lab_b_type.delete(permanent=True) # also see 
test_features_name_duplicates_across_equal_levels def test_curator_schema_feature_mapping(): lab_a_type = ln.Feature(name="LabA", is_type=True).save() feature1 = ln.Feature(name="sample_name", dtype="str", type=lab_a_type).save() lab_b_type = ln.Feature(name="LabB", is_type=True).save() feature2 = ln.Feature(name="sample_name", dtype="str", type=lab_b_type).save() schema = ln.Schema([feature1], name="Lab A schema").save() df = pd.DataFrame({"sample_name": ["s1", "s2"]}) curator = ln.curators.DataFrameCurator(df, schema) curator.validate() cat_vector = curator._atomic_curator.cat._cat_vectors["columns"] assert len(cat_vector.records) == 1 assert len(cat_vector._validated) == 1 schema.delete(permanent=True) feature1.delete(permanent=True) feature2.delete(permanent=True) lab_a_type.delete(permanent=True) lab_b_type.delete(permanent=True) def test_dtypes_at_different_levels(ccaplog): sample_type_root = ln.Record(name="Sample", is_type=True).save() lab_a_type = ln.Record(name="LabA", is_type=True).save() sample_type_a = ln.Record(name="Sample", is_type=True, type=lab_a_type).save() s1_lab_a = ln.Record(name="s1", type=sample_type_a).save() df = pd.DataFrame({"biosample_name": pd.Categorical(["s1"])}) feature = ln.Feature(name="biosample_name", dtype=sample_type_root).save() schema = ln.Schema(features=[feature]).save() sample_type_root.delete() df = pd.DataFrame({"biosample_name": pd.Categorical(["s1"])}) # UID-based lookup can find records in trash, so curator creation should succeed # but a warning should be printed curator = ln.curators.DataFrameCurator(df, schema) assert "from trash" in ccaplog.text schema.delete(permanent=True) sample_type_root.restore() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert "1 term not validated in feature 'biosample_name': 's1'" in error.exconly() s1_root = ln.Record(name="s1", type=sample_type_root).save() curator.validate() cat_vector = curator._atomic_curator.cat._cat_vectors["biosample_name"] assert cat_vector._validated == ["s1"] assert len(cat_vector.records) == 1 assert cat_vector.records[0] == s1_root # update feature dtype feature.delete(permanent=True) feature = ln.Feature(name="biosample_name", dtype=sample_type_a).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) curator.validate() cat_vector = curator._atomic_curator.cat._cat_vectors["biosample_name"] assert cat_vector._validated == ["s1"] assert len(cat_vector.records) == 1 assert cat_vector.records[0] == s1_lab_a feature.delete(permanent=True) s1_lab_a.delete(permanent=True) sample_type_a.delete(permanent=True) lab_a_type.delete(permanent=True) s1_root.delete(permanent=True) sample_type_root.delete(permanent=True) def test_nullable(): disease = ln.Feature(name="disease", dtype=ln.ULabel, nullable=False).save() schema = ln.Schema(features=[disease]).save() dataset = {"disease": pd.Categorical([pd.NA, "asthma"])} df = pd.DataFrame(dataset) curator = ln.curators.DataFrameCurator(df, schema) with pytest.raises(ln.errors.ValidationError) as err: curator.validate() assert "non-nullable series 'disease' contains null values" in err.exconly() # make feature nullable # (needs to throw an error if already datasets were validated with it) disease.nullable = True disease.save() curator = ln.curators.DataFrameCurator(df, schema) with pytest.raises( ValidationError, # match=re.escape("1 term is not validated: 'asthma'"), # TODO: need the message ): 
curator.validate() schema.delete(permanent=True) disease.delete(permanent=True) def test_pandera_dataframe_schema( df, df_missing_sample_type_column, df_changed_col_order, df_extra_column, df_missing_sample_name_column, ): # schemas schema_all_required = ln.Schema( name="my-schema all required", features=[ ln.Feature(name="sample_id", dtype=str).save(), ln.Feature(name="sample_name", dtype=str).save(), ln.Feature(name="sample_type", dtype=str).save(), ], ).save() schema_maximal_set = ln.Schema( name="my-schema maximal_set", features=[ ln.Feature(name="sample_id", dtype=str).save(), ln.Feature(name="sample_name", dtype=str).save(), ln.Feature(name="sample_type", dtype=str).save(), ], minimal_set=False, maximal_set=True, ).save() schema_ordered_set = ln.Schema( name="my-schema ordered_set", features=[ ln.Feature(name="sample_id", dtype=str).save(), ln.Feature(name="sample_name", dtype=str).save(), ln.Feature(name="sample_type", dtype=str).save(), ], ordered_set=True, ).save() # minimal_set=True, all three columns are required ln.curators.DataFrameCurator(df, schema=schema_all_required).validate() # can't miss a required column with pytest.raises(ValidationError): ln.curators.DataFrameCurator( df_missing_sample_type_column, schema=schema_all_required ).validate() # doesn't care about order ln.curators.DataFrameCurator( df_changed_col_order, schema=schema_all_required ).validate() # extra column is fine ln.curators.DataFrameCurator(df_extra_column, schema=schema_all_required).validate() # maximal_set=True, extra column is *not* allowed # check that __lamindb values are OK df["__lamindb_record_uid__"] = "some_value" ln.curators.DataFrameCurator(df, schema=schema_maximal_set).validate() del df["__lamindb_record_uid__"] with pytest.raises(ValidationError): ln.curators.DataFrameCurator( df_extra_column, schema=schema_maximal_set, # extra column is not allowed ).validate() # minimal_set=False, missing column is allowed ln.curators.DataFrameCurator( df_missing_sample_type_column, schema=schema_maximal_set ).validate() # ordered_set=True, order matters with pytest.raises(ValidationError): ln.curators.DataFrameCurator( df_changed_col_order, schema=schema_ordered_set ).validate() # a feature is optional schema_optional_sample_name = ln.Schema( name="my-schema optional sample_name", features=[ ln.Feature(name="sample_id", dtype=str).save(), ln.Feature(name="sample_name", dtype=str).save().with_config(optional=True), ln.Feature(name="sample_type", dtype=str).save(), ], ).save() # missing required "sample_type" column raises an error with pytest.raises(ValidationError): ln.curators.DataFrameCurator( df_missing_sample_type_column, schema=schema_optional_sample_name, ).validate() # missing optional column "sample_name" is fine ln.curators.DataFrameCurator( df_missing_sample_name_column, schema=schema_optional_sample_name ).validate() # clean up ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_schema_not_saved(df): """Attempting to validate an unsaved Schema must error.""" feature = ln.Feature(name="cell_type", dtype=str).save() schema = ln.Schema(features=[feature]) with pytest.raises(ValueError) as excinfo: ln.curators.DataFrameCurator(df, schema) assert excinfo.exconly() == ( "ValueError: Schema must be saved before curation. Please save it using '.save()'." 
) def test_schema_artifact_annotated(df): """A passed Artifact should be annotated with a Schema if successfully curated.""" af = ln.Artifact.from_dataframe(df, key="test.parquet").save() schema = ln.Schema( name="sample schema", features=[ln.Feature(name="sample_id", dtype="str").save()], ).save() curator = ln.curators.DataFrameCurator(af, schema) curator.validate() curator.save_artifact() af_queried = ln.Artifact.filter(key="test.parquet").one() assert af_queried.schema is not None # clean up af.delete(permanent=True) ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_schema_optionals(): schema = ln.Schema( name="my-schema", features=[ ln.Feature(name="sample_id", dtype=str).save(), ln.Feature(name="sample_name", dtype=str).save().with_config(optional=True), ln.Feature(name="sample_type", dtype=str).save(), ], ).save() assert schema.optionals.get().to_list("name") == [ "sample_name", ] # set sample_type to optional with pytest.raises( TypeError, match=re.escape("features must be a list of Feature records!"), ): schema.optionals.set("test") schema.optionals.set([ln.Feature.get(name="sample_type")]) assert schema.optionals.get().to_list("name") == ["sample_type"] # add sample_name to optionals with pytest.raises( TypeError, match=re.escape("features must be a list of Feature records!"), ): schema.optionals.add("test") schema.optionals.add(ln.Feature.get(name="sample_name")) assert schema.optionals.get().to_list("name") == ["sample_name", "sample_type"] # clean up ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_schema_ordered_set(df): # create features with a different order so that sample_id is not the first ln.Feature(name="sample_name", dtype=str).save() ln.Feature(name="sample_type", dtype=str).save() ln.Feature(name="sample_id", dtype=str).save() # create an ordered schema with sample_id as the first feature schema = ln.Schema( name="my-schema", features=[ ln.Feature(name="sample_id", dtype=str).save(), ln.Feature(name="sample_name", dtype=str).save(), ln.Feature(name="sample_type", dtype=str).save(), ], ordered_set=True, ).save() assert ln.curators.DataFrameCurator(df, schema=schema).validate() is None # clean up ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) @pytest.mark.parametrize("minimal_set", [True, False]) def test_schema_minimal_set_var_allowed(minimal_set): """Independent of the value of minimal_set, invalid ensembl gene IDs are allowed.""" adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") adata.var_names = [adata.var_names[0], adata.var_names[1], "NOT_VALID_ENSEMBL"] var_schema = ln.Schema( itype=bt.Gene.ensembl_gene_id, minimal_set=minimal_set, ).save() schema = ln.Schema(otype="AnnData", slots={"var.T": var_schema}).save() curator = ln.curators.AnnDataCurator(adata, schema) curator.validate() # clean up schema.delete(permanent=True) def test_schema_maximal_set_var(): """If maximal_set is True, invalid ensembl gene IDs are not allowed.""" adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") adata.var_names = [adata.var_names[0], adata.var_names[1], "NOT_VALID_ENSEMBL"] var_schema = ln.Schema(itype=bt.Gene.ensembl_gene_id, maximal_set=True).save() schema = ln.Schema(otype="AnnData", slots={"var.T": var_schema}).save() curator = ln.curators.AnnDataCurator(adata, schema) with pytest.raises(ValidationError) as error: curator.validate() assert error.exconly() == ( "lamindb.errors.ValidationError: 1 term not 
validated in feature 'columns' in slot 'var.T': 'NOT_VALID_ENSEMBL'\n" " → fix typos, remove non-existent values, or save terms via: curator.slots['var.T'].cat.add_new_from('columns')" ) # clean up schema.delete(permanent=True) def test_feature_dtype_path(): df = pd.DataFrame( { "sample": ["Sample_X", "Sample_Y", "Sample_Y"], "fastq_1": [ "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R1_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R1_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R1_001.fastq.gz", ], "fastq_2": [ "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R2_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R2_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R2_001.fastq.gz", ], "expected_cells": [5000, 5000, 5000], } ) nextflow_schema = ln.Schema( name="nf-core/scrnaseq pipeline - params.input schema", description="https://github.com/nf-core/scrnaseq/blob/4.0.0/assets/schema_input.json", features=[ ln.Feature( name="sample", dtype="str", nullable=False, description="Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (_).", ).save(), ln.Feature( name="fastq_1", dtype="path", nullable=False, description="Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension “.fastq.gz” or “.fq.gz”.", ).save(), ln.Feature( name="fastq_2", dtype="path", description="Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension “.fastq.gz” or “.fq.gz”.", ).save(), ln.Feature( name="expected_cells", dtype=int, description="Number of cells expected for a sample. Must be an integer. If multiple rows are provided for the same sample, this must be the same number for all rows, i.e. the total number of expected cells for the sample.", ).save(), ln.Feature( name="seq_center", dtype=str, description="Sequencing center for the sample. If multiple rows are provided for the same sample, this must be the same string for all rows. 
Samples sequenced at different centers are considered different samples and must have different identifiers.", ).save(), ln.Feature( name="sample_type", dtype=str, description='"atac", "gex"', ).save(), ln.Feature( name="feature_type", dtype=str, description='"gex", "vdj", "ab", "crispr", "cmo"', ).save(), ], ).save() nextflow_schema.optionals.set( [ ln.Feature.get(name="expected_cells"), ln.Feature.get(name="seq_center"), ln.Feature.get(name="sample_type"), ln.Feature.get(name="feature_type"), ] ) curator = ln.curators.DataFrameCurator(df, schema=nextflow_schema) assert curator.validate() is None # clean up nextflow_schema.delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_cat_filters_specific_source_uid(df_disease, disease_ontology_old): """Specific source_uid passed to the `cat_filters`""" feature = ln.Feature( name="disease", dtype=bt.Disease, cat_filters={"source__uid": disease_ontology_old.uid}, ).save() schema = ln.Schema([feature], name="test schema").save() curator = ln.curators.DataFrameCurator(df_disease, schema) try: curator.validate() except ln.errors.ValidationError as error: assert ( "2 terms not validated in feature 'disease': 'HDAC4-related haploinsufficiency syndrome', 'SAMD9L-related spectrum and myeloid neoplasm risk'" in str(error) ) schema.delete(permanent=True) feature.delete(permanent=True) def test_cat_filters_specific_source(df_disease, disease_ontology_old): """Specific Source record passed to the `cat_filters`""" feature = ln.Feature( name="disease", dtype=bt.Disease, cat_filters={"source": disease_ontology_old}, ).save() schema = ln.Schema([feature], name="test schema").save() curator = ln.curators.DataFrameCurator(df_disease, schema) try: curator.validate() except ln.errors.ValidationError as error: assert ( "2 terms not validated in feature 'disease': 'HDAC4-related haploinsufficiency syndrome', 'SAMD9L-related spectrum and myeloid neoplasm risk'" in str(error) ) schema.delete(permanent=True) feature.delete(permanent=True) def test_cat_filters_multiple_relation_filters(df_disease, disease_ontology_old): """Multiple relation filters in cat_filters""" # TODO: needs to also work if both filters are from the same related model!!! 
feature = ln.Feature( name="disease", dtype=bt.Disease, cat_filters={ "source__uid": disease_ontology_old.uid, "created_by__handle": ln.setup.settings.user.handle, }, ).save() schema = ln.Schema([feature], name="test schema").save() curator = ln.curators.DataFrameCurator(df_disease, schema) try: curator.validate() except ln.errors.ValidationError as error: assert ( "2 terms not validated in feature 'disease': 'HDAC4-related haploinsufficiency syndrome', 'SAMD9L-related spectrum and myeloid neoplasm risk'" in str(error) ) schema.delete(permanent=True) feature.delete(permanent=True) def test_curate_columns(df): """Test that columns can be curated.""" schema = ln.Schema( name="sample schema", features=[ ln.Feature(name="sample_id", dtype="str").save(), ln.Feature(name="sample_name", dtype="str").save(), ln.Feature(name="sample_type", dtype="str").save(), ], ).save() # make one column name invalid df.rename(columns={"sample_name": "sample_name_name"}, inplace=True) curator = ln.curators.DataFrameCurator(df, schema) try: curator.validate() except ln.errors.ValidationError as error: assert "column 'sample_name' not in dataframe" in str(error) # now fix the column df.rename(columns={"sample_name_name": "sample_name"}, inplace=True) curator.validate() schema.delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_wrong_datatype(df): feature = ln.Feature(name="sample_id", dtype=ln.ULabel).save() schema = ln.Schema(features=[feature]).save() curator = ln.curators.DataFrameCurator(df, schema) with pytest.raises(ln.errors.ValidationError) as excinfo: curator.validate() assert "expected series 'sample_id' to have type category, got object" in str( excinfo.value ) assert ( "Hint: Consider setting `feature.coerce = True` to attempt coercing values during validation to the required dtype." 
in str(excinfo.value) ) schema.delete(permanent=True) feature.delete(permanent=True) def test_hash_index_feature(df): df_index = df.set_index("sample_id") sample_name = ln.Feature(name="sample_name", dtype="str").save() sample_name.uid = "OpQAD5Ifu89t" sample_name.save() sample_type = ln.Feature(name="sample_type", dtype="str").save() sample_type.uid = "7I4u69RiCAVy" sample_type.save() sample_id = ln.Feature(name="sample_id", dtype="str").save() sample_id.uid = "uValv1YfEQib" sample_id.save() schema_index = ln.Schema( name="sample schema with index", features=[ sample_name, sample_type, ], index=sample_id, ).save() assert schema_index.hash == "drtQMP4N4xEebS49DO-9Jw" schema = ln.Schema( name="sample schema", features=[ sample_id, sample_name, sample_type, ], ).save() assert schema.hash == "Z_dmk1WendD15s2FyBW1HA" artifact = ln.Artifact.from_dataframe( df_index, key="curated_df.parquet", schema=schema_index ).save() assert artifact.schemas.all().one() == schema_index # clean up artifact.delete(permanent=True) schema_index.delete(permanent=True) schema.delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_add_new_from_subtype(df): """Test that add_new_from works with subtypes.""" sample_type = ln.Record(name="SampleType", is_type=True).save() ln.Record(name="Type A", type=sample_type).save() schema = ln.Schema( name="sample schema", features=[ ln.Feature(name="sample_id", dtype="str").save(), ln.Feature(name="sample_name", dtype="str").save(), ln.Feature(name="sample_type", dtype=sample_type).save(), ], coerce=True, ).save() curator = ln.curators.DataFrameCurator(df, schema) try: curator.validate() except ln.errors.ValidationError as error: assert "1 term not validated in feature 'sample_type': 'Type B'" in str(error) # add new from subtype curator.cat.non_validated["sample_type"] curator.cat.add_new_from("sample_type") curator.validate() assert sample_type.records.to_list("name") == ["Type A", "Type B"] # clean up schema.delete(permanent=True) ln.Feature.filter().delete(permanent=True) ln.Record.filter().update(type=None) ln.Record.filter().delete(permanent=True) def test_index_feature_exclusion_from_categoricals(df): df_indexed = df.set_index("sample_id") sample_type_feature = ln.Feature(name="sample_type", dtype="cat[ULabel]").save() sample_id_feature = ln.Feature(name="sample_id", dtype="cat[ULabel]").save() # schema with sample_id as index (not in features) schema = ln.Schema(features=[sample_type_feature], index=sample_id_feature).save() curator = ln.curators.DataFrameCurator(df_indexed, schema) # Verify that only sample_type is in categoricals, not sample_id (index) categoricals_names = [ f.name for f in curator._atomic_curator._cat_manager._categoricals ] assert "sample_type" in categoricals_names assert "sample_id" not in categoricals_names # Verify the cat_vectors do not include the index feature cat_vector_keys = list(curator.cat._cat_vectors.keys()) assert "sample_type" in cat_vector_keys assert "sample_id" not in cat_vector_keys assert "columns" in cat_vector_keys # clean up schema.delete(permanent=True) ln.Feature.filter().delete(permanent=True) ================================================ FILE: tests/core/test_data_migrations.py ================================================ """Tests for PostgreSQL data migrations.""" import os import lamindb as ln import pytest @pytest.mark.skipif( os.getenv("LAMINDB_TEST_DB_VENDOR") != "postgresql", reason="PostgreSQL-specific migration test", ) def test_migrate_auxiliary_fields_postgres(): """Test PostgreSQL 
migration of auxiliary fields for models. This test verifies that migrate_auxiliary_fields_postgres correctly migrates: **Artifact:** - _save_completed from _aux['af']['0'] **Run:** - cli_args from _aux['af']['0'] **Feature:** - default_value from _aux['af']['0'] - nullable from _aux['af']['1'] (default: True) - coerce from _aux['af']['2'] (default: False) - For type features, all values are set to NULL **Schema:** - coerce from _aux['af']['0'] - flexible from _aux['af']['2'] (or computes from n_members) - Converts negative n_members to NULL - For type schemas, all values are set to NULL - Preserves '1' (optionals) and '3' (index_feature_uid) in _aux """ from django.db import connection from lamindb.models.schema import migrate_auxiliary_fields_postgres # === Setup test data === # Create a Transform and Run for testing transform = ln.Transform(key="test_migration_transform").save() run = ln.Run(transform=transform).save() # Create an Artifact for testing artifact = ln.Artifact(".gitignore", key="test_migration_artifact").save() # Create Features for testing (type and regular) type_feature = ln.Feature( name="TestMigrationTypeFeat", dtype=str, is_type=True ).save() regular_feature = ln.Feature(name="test_migration_regular_feat", dtype=str).save() # Create Schemas for testing (type and regular) type_schema = ln.Schema(name="TestMigrationTypeSchema", is_type=True).save() feature_for_schema1 = ln.Feature( name="test_migration_schema_feat1", dtype=str ).save() feature_for_schema2 = ln.Feature( name="test_migration_schema_feat2", dtype=str ).save() regular_schema = ln.Schema( name="TestMigrationRegularSchema", features=[feature_for_schema1, feature_for_schema2], coerce=True, flexible=True, ).save() # === Add _save_completed column temporarily (removed in migration 0173) === with connection.cursor() as cursor: cursor.execute( """ DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'lamindb_artifact' AND column_name = '_save_completed' ) THEN ALTER TABLE lamindb_artifact ADD COLUMN _save_completed BOOLEAN; END IF; END $$; """ ) # === Set old-style _aux data to simulate pre-migration state === with connection.cursor() as cursor: # Artifact: set _aux with af containing _save_completed value cursor.execute( """ UPDATE lamindb_artifact SET _aux = '{"af": {"0": true}}'::jsonb, _save_completed = NULL WHERE id = %s """, [artifact.id], ) # Run: set _aux with af containing cli_args value cursor.execute( """ UPDATE lamindb_run SET _aux = '{"af": {"0": "--verbose --debug"}}'::jsonb, cli_args = NULL WHERE id = %s """, [run.id], ) # Feature (type): set _aux with af keys that should result in NULL values cursor.execute( """ UPDATE lamindb_feature SET _aux = '{"af": {"0": "default_val", "1": false, "2": true}}'::jsonb, default_value = NULL, nullable = NULL, coerce = NULL WHERE id = %s """, [type_feature.id], ) # Feature (regular): set _aux with af keys for migration cursor.execute( """ UPDATE lamindb_feature SET _aux = '{"af": {"0": "my_default", "1": false, "2": true}}'::jsonb, default_value = NULL, nullable = NULL, coerce = NULL WHERE id = %s """, [regular_feature.id], ) # Schema (type): set _aux with af keys that should be cleaned cursor.execute( """ UPDATE lamindb_schema SET _aux = '{"af": {"0": true, "2": false}}'::jsonb, coerce = NULL, flexible = NULL WHERE id = %s """, [type_schema.id], ) # Schema (regular): set _aux with af keys including optionals (key "1") cursor.execute( """ UPDATE lamindb_schema SET _aux = '{"af": {"0": true, "1": ["uid1", "uid2"], "2": 
true}}'::jsonb, coerce = NULL, flexible = NULL WHERE id = %s """, [regular_schema.id], ) # === Run the migration function === with connection.schema_editor() as schema_editor: migrate_auxiliary_fields_postgres(schema_editor) # === Refresh all objects from database === run.refresh_from_db() type_feature.refresh_from_db() regular_feature.refresh_from_db() type_schema.refresh_from_db() regular_schema.refresh_from_db() # === Verify Artifact migration === with connection.cursor() as cursor: cursor.execute( "SELECT _save_completed, _aux FROM lamindb_artifact WHERE id = %s", [artifact.id], ) row = cursor.fetchone() assert row[0] is True # _save_completed from _aux['af']['0'] # _aux should have 'af' removed (was only key) assert row[1] is None or "af" not in ( row[1] if isinstance(row[1], dict) else {} ) # === Verify Run migration === assert run.cli_args == "--verbose --debug" # from _aux['af']['0'] # _aux should have 'af' removed assert run._aux is None or "af" not in run._aux # === Verify Feature (type) migration === # Type features should have all values set to NULL assert type_feature.default_value is None assert type_feature.nullable is None assert type_feature.coerce is None # _aux should have 'af' removed assert type_feature._aux is None or "af" not in type_feature._aux # === Verify Feature (regular) migration === assert regular_feature.default_value == "my_default" # from _aux['af']['0'] assert regular_feature.nullable is False # from _aux['af']['1'] assert regular_feature.coerce is True # from _aux['af']['2'] # _aux should have 'af' removed assert regular_feature._aux is None or "af" not in regular_feature._aux # === Verify Schema (type) migration === assert type_schema.coerce is None assert type_schema.flexible is None assert type_schema.n_members is None # _aux should either be None or not have '0' and '2' keys in 'af' if type_schema._aux is not None and "af" in type_schema._aux: assert "0" not in type_schema._aux["af"] assert "2" not in type_schema._aux["af"] # === Verify Schema (regular) migration === assert regular_schema.coerce is True # from _aux['af']['0'] assert regular_schema.flexible is True # from _aux['af']['2'] # _aux should preserve key '1' (optionals) assert regular_schema._aux is not None assert "af" in regular_schema._aux assert "1" in regular_schema._aux["af"] assert regular_schema._aux["af"]["1"] == ["uid1", "uid2"] # Keys '0' and '2' should be removed assert "0" not in regular_schema._aux["af"] assert "2" not in regular_schema._aux["af"] # === Clean up: remove temporary column and delete records === with connection.cursor() as cursor: cursor.execute( """ DO $$ BEGIN IF EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'lamindb_artifact' AND column_name = '_save_completed' ) THEN ALTER TABLE lamindb_artifact DROP COLUMN _save_completed; END IF; END $$; """ ) regular_schema.delete(permanent=True) type_schema.delete(permanent=True) feature_for_schema1.delete(permanent=True) feature_for_schema2.delete(permanent=True) regular_feature.delete(permanent=True) type_feature.delete(permanent=True) artifact.delete(permanent=True) run.delete(permanent=True) transform.delete(permanent=True) ================================================ FILE: tests/core/test_db.py ================================================ import lamindb as ln def test_create_to_load(): transform = ln.Transform(version="0", key="test", kind="pipeline") transform.save() run = ln.Run(transform=transform) run.save() ln.Storage.get(root=str(ln.setup.settings.storage.root)) 
================================================ FILE: tests/core/test_delete.py ================================================ import bionty as bt import lamindb as ln import pytest @pytest.mark.parametrize("permanent", [True, False]) def test_delete_qs(permanent): """Test deletion behavior for small (1) and large (>=2) querysets. Small querysets delete individually, large ones trigger bulk delete.""" ln.settings.creation.search_names = False labels = [ln.Record(name=f"label_{i}") for i in range(3)] ln.settings.creation.search_names = True ln.save(labels) ln.Record.filter(name__startswith="label_").delete(permanent=permanent) assert ln.Record.filter(name__startswith="label_", branch_id=-1).count() == ( 0 if permanent else 3 ) assert ln.ULabel.filter(name__startswith="label_").count() == 0 def test_recreate_soft_deleted_record(): # testing soft delete and recreate with postgres (sqlite is tested in curators/test_records.py) # soft delete a record, then recreate it with some changes record = bt.Ethnicity.from_source(ontology_id="HANCESTRO:0006").save() assert record.branch_id == 1 record.delete() assert record.branch_id == -1 # now recreate the same record from ontology_id with a different description # there's a unique constraint on ontology_id, so this should recover the trashed record record = bt.Ethnicity.from_source(ontology_id="HANCESTRO:0006") record.description = "new description" record.save() # now this record is recovered from the trash with the new description assert record.branch_id == 1 assert record.description == "new description" bt.Ethnicity.objects.filter().delete() ================================================ FILE: tests/core/test_feature.py ================================================ import bionty as bt import lamindb as ln import pandas as pd import pytest from lamindb.errors import ValidationError from lamindb.models.feature import serialize_pandas_dtype from pandas.api.types import is_string_dtype @pytest.fixture(scope="module") def dict_data(): return { "dict_feat1": 42, "dict_feat2": 3.14, "dict_feat3": "somestring", # string (ambiguous cat ? str) "dict_feat4": True, "dict_feat5": [1, 2, 3], "dict_feat6": ["a", "b", "c"], # list[str] (ambiguous list[cat ? 
str]) "dict_feat7": {"key": "value"}, } def test_feature_init(): # positional args not supported with pytest.raises(ValueError): ln.Feature("x") # dtype required unless is_type=True with pytest.raises(ValidationError): ln.Feature(name="feat") # is OK if also is_type is passed ln.Feature(name="Feat", is_type=True) # invalid dtype string with pytest.raises(ValueError): ln.Feature(name="feat", dtype="x") # categorical dtype must specify valid types with pytest.raises(ValidationError): ln.Feature(name="feat", dtype="cat[1]") # ensure feat1 does not exist if feat1 := ln.Feature.filter(name="feat1").one_or_none() is not None: feat1.delete(permanent=True) feat1 = ln.Feature(name="feat", dtype="str").save() # duplicate name with different dtype should fail with pytest.raises(ValidationError) as error: ln.Feature(name="feat", dtype=ln.ULabel) assert ( error.exconly() == "lamindb.errors.ValidationError: Feature feat already exists with dtype str, you passed cat[ULabel]" ) feat1.delete(permanent=True) # string and list syntax for categorical dtypes should be equivalent and work feat2 = ln.Feature(name="feat2", dtype="str", description="feat2").save() feat2_again = ln.Feature(name="feat2", dtype="str", description="feat2").save() assert feat2 == feat2_again feat2.delete(permanent=True) # categorical dtype with union of registries using string syntax must be valid feature = ln.Feature(name="feat1", dtype="cat[Record|bionty.Gene]") assert feature._dtype_str == "cat[Record|bionty.Gene]" # categorical dtype with union of registries using objects must be valid feature = ln.Feature(name="feat1", dtype=[ln.Record, bt.Gene]) assert feature._dtype_str == "cat[Record|bionty.Gene]" # dtype with field name before bracket filters must be valid feature = ln.Feature( name="gene_feature", dtype="cat[bionty.Gene.ensembl_gene_id[organism='human']]" ) print(feature._dtype_str) assert "bionty.Gene" in feature._dtype_str assert "ensembl_gene_id" in feature._dtype_str assert "organism='human'" in feature._dtype_str # @pytest.mark.skipif( # os.getenv("LAMINDB_TEST_DB_VENDOR") == "sqlite", reason="Postgres-only" # ) # def test_cannot_mutate_dtype(): # feature = ln.Feature(name="feature", dtype=str).save() # feature._dtype_str = int # with pytest.raises(django.db.utils.IntegrityError) as error: # feature.save() # assert "dtype field is immutable and cannot be changed" in error.exconly() # feature.delete(permanent=True) # def test_cat_filters_dtype(): # feature = ln.Feature( # name="disease", # dtype=bt.Disease, # cat_filters={ # "source__uid": "4a3ejKuf" # }, # uid corresponds to disease_ontology_old.uid # ).save() # assert feature._dtype_str == "cat[bionty.Disease[source__uid='4a3ejKuf']]" # feature.delete(permanent=True) def test_cat_filters_empty_filter(): # empty filter values should be rejected with pytest.raises(ValidationError) as error: ln.Feature(name="feat_empty", dtype=bt.Disease, cat_filters={"source__uid": ""}) assert ( "lamindb.errors.ValidationError: Empty value in filter source__uid" in error.exconly() ) def test_cat_filters_invalid_field_name(): # invalid filter field names should be rejected source = bt.Source( name="", description="", organism="", entity="", version="" ).save() with pytest.raises(ValidationError) as error: ln.Feature( name="feat_invalid_attr", dtype=bt.Disease, cat_filters={"source__invalid_field": source}, ) assert ( "lamindb.errors.ValidationError: SQLRecord Source has no attribute 'invalid_field' in filter source__invalid_field" in error.exconly() ) source.delete(permanent=True) def 
test_cat_filters_artifact_schema_filter(): schema_feature = ln.Feature(name="schema_filter_column", dtype=str).save() schema = ln.Schema(name="schema_filter_schema", features=[schema_feature]).save() try: feature = ln.Feature( name="artifact_input", dtype=ln.Artifact, cat_filters={"schema": schema}, ) assert feature._dtype_str == f"cat[Artifact[schema__uid='{schema.uid}']]" finally: schema.delete(permanent=True) schema_feature.delete(permanent=True) def test_feature_from_df(): df = pd.DataFrame( { "feat1": [1, 2, 3], "feat2": [3.1, 4.2, 5.3], "feat3": pd.Categorical(["cond1", "cond2", "cond2"]), "feat4": ["id1", "id2", "id3"], "rando_feature": ["rando1", "rando2", "rando3"], } ) if feat1 := ln.Feature.filter(name="feat1").one_or_none() is not None: feat1.delete(permanent=True) features = ln.Feature.from_dataframe(df.iloc[:, :4]).save() artifact = ln.Artifact.from_dataframe(df, description="test").save() # test for deprecated add_feature_set schema = ln.Schema(features).save() artifact.features._add_schema(schema, slot="columns") features = artifact.features.slots["columns"].features.all() assert len(features) == len(df.columns[:4]) [col for col in df.columns if is_string_dtype(df[col])] categoricals = { col: df[col] for col in df.columns if isinstance(df[col], pd.CategoricalDtype) } for feature in features: if feature.name in categoricals: assert feature._dtype_str == "cat" else: orig_type = df[feature.name].dtype assert feature._dtype_str == serialize_pandas_dtype(orig_type) for feature in features: feature.save() labels = [ln.Record(name=name) for name in df["feat3"].unique()] ln.save(labels) feature = ln.Feature.get(name="feat3") with pytest.raises(ValidationError) as err: artifact.labels.add(labels, feature=feature) assert ( err.exconly() == "lamindb.errors.ValidationError: Cannot manually annotate a feature measured *within* the dataset. Please use a Curator." 
) extfeature = ln.Feature(name="extfeat", dtype="str").save() with pytest.raises(ValidationError) as err: artifact.labels.add(labels, feature=extfeature) assert ( err.exconly() == f"lamindb.errors.ValidationError: Feature {extfeature.name} needs dtype='cat' for label annotation, currently has dtype='str'" ) # clean up artifact.delete(permanent=True) ln.Schema.filter().delete(permanent=True) ln.Record.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_feature_from_dict(dict_data): # defaults to str for ambiguous types features = ln.Feature.from_dict(dict_data) assert len(features) == len(dict_data) assert features[0]._dtype_str == "int" assert features[1]._dtype_str == "float" assert features[2]._dtype_str == "str" assert features[3]._dtype_str == "bool" assert features[4]._dtype_str == "list[int]" assert features[5]._dtype_str == "list[str]" assert features[6]._dtype_str == "dict" # Wrong field with pytest.raises(ValueError) as e: ln.Feature.from_dict(dict_data, field=ln.Record.name) assert "field must be a Feature FieldAttr" in str(e.value) # Explicit field features_with_field = ln.Feature.from_dict(dict_data, field=ln.Feature.name) assert len(features_with_field) == len(dict_data) def test_feature_from_dict_type(dict_data): feature_type = ln.Feature(name="Testdata_feature_type", is_type=True).save() features = ln.Feature.from_dict(dict_data, type=feature_type).save() for feature in features: assert feature.type.name == "Testdata_feature_type" ln.Feature.filter(type__isnull=False).delete(permanent=True) feature_type.delete(permanent=True) def test_feature_query_by_dtype(): """Test querying Feature by dtype (deprecated) and _dtype_str.""" str_feat = ln.Feature(name="test_str_feat", dtype=str).save() int_feat = ln.Feature(name="test_int_feat", dtype=int).save() try: # Test querying by _dtype_str (current way) str_features = ln.Feature.filter(_dtype_str="str", name="test_str_feat") assert str_features.count() == 1 assert str_features.first() == str_feat str_features = ln.Feature.filter(dtype_as_str="str", name="test_str_feat") assert str_features.count() == 1 assert str_features.first() == str_feat # Test querying by dtype (deprecated) - should work but issue warning with pytest.warns( DeprecationWarning, match="Querying Feature by `dtype` is deprecated.*Notice the new dtype encoding format", ): str_features_deprecated = ln.Feature.filter( dtype="str", name="test_str_feat" ) assert str_features_deprecated.count() == 1 assert str_features_deprecated.first() == str_feat finally: # Clean up str_feat.delete(permanent=True) int_feat.delete(permanent=True) ================================================ FILE: tests/core/test_feature_dtype.py ================================================ import datetime import bionty as bt import lamindb as ln import pandas as pd import pytest from lamindb import Record from lamindb.errors import ValidationError from lamindb.models.feature import ( dtype_as_object, parse_dtype, parse_filter_string, resolve_relation_filters, serialize_dtype, ) @pytest.fixture def organism(): organism = bt.Organism(name="test_organism") organism.uid = "testuid2" organism.save() return organism # ----------------------------------------------------------------------------- # serializing dtypes # ----------------------------------------------------------------------------- def test_serialize_basic_dtypes(): assert serialize_dtype(int) == "int" assert serialize_dtype(float) == "float" assert serialize_dtype(str) == "str" assert serialize_dtype(bool) 
== "bool" assert serialize_dtype(dict) == "dict" # assert serialize_dtype(bytes) == "bytes" # not yet supported assert serialize_dtype(datetime.datetime) == "datetime" assert serialize_dtype(datetime.date) == "date" def test_serialize_basic_list_dtypes(): assert serialize_dtype(list[int]) == "list[int]" assert serialize_dtype(list[float]) == "list[float]" assert serialize_dtype(list[str]) == "list[str]" assert serialize_dtype(list[bool]) == "list[bool]" assert serialize_dtype(list[dict]) == "list[dict]" assert serialize_dtype(list[datetime.datetime]) == "list[datetime]" assert serialize_dtype(list[datetime.date]) == "list[date]" def test_seralize_pandas_numpy_dtypes(): series = pd.Series([1, 4, 0, 10, 9], dtype="uint") assert series.dtype.name == "uint64" assert serialize_dtype(series.dtype) == "int" def test_serialize_user(ccaplog): # correct way through Python object and serialize_dtype() feature = ln.Feature(name="user_feat", dtype=ln.User) assert feature._dtype_str == "cat[User]" # legacy way through parse_dtype() feature = ln.Feature(name="user_feat", dtype="cat[User]") assert ( "rather than passing a string 'cat[User]' to dtype, consider passing a Python object" in ccaplog.text ) assert feature._dtype_str == "cat[User]" def test_serialize_record_objects(): insitute_type = ln.Record(name="InstituteA", is_type=True) with pytest.raises(ln.errors.InvalidArgument) as error: serialize_dtype(insitute_type) assert ( f"Cannot serialize unsaved objects. Save {insitute_type} via `.save()`." in error.exconly() ) insitute_type.save() lab_type = ln.Record(name="LabB", type=insitute_type, is_type=True).save() sample_type = ln.Record(name="Sample", type=lab_type, is_type=True).save() # New UID-based format: cat[Record[uid]] instead of cat[Record[Parent[Child]]] serialized_str = f"cat[Record[{sample_type.uid}]]" feature = ln.Feature(name="sample_feature", dtype=sample_type).save() assert feature._dtype_str == serialized_str assert feature.dtype == "cat[Record[InstituteA[LabB[Sample]]]]" feature.delete(permanent=True) assert serialize_dtype(sample_type) == serialized_str with pytest.raises(ln.errors.IntegrityError) as error: parse_dtype("cat[Record[Sample]]", check_exists=True, old_format=True) assert ( "No Record type found matching subtypes ['Sample'] for field `.name`" in error.exconly() ) sample = ln.Record(name="sample").save() with pytest.raises(ln.errors.InvalidArgument) as error: parse_dtype(f"cat[Record[{sample.uid}]]", check_exists=True) assert ( f"The resolved Record 'sample' (uid='{sample.uid}') is not a type: is_type is False." in error.exconly() ) with pytest.raises(ln.errors.InvalidArgument) as error: serialize_dtype(sample) assert ( "Cannot serialize non-type Record 'sample'. Only types (is_type=True) are allowed in dtypes." 
in error.exconly() ) sample_type.delete(permanent=True) lab_type.delete(permanent=True) insitute_type.delete(permanent=True) sample.delete(permanent=True) def test_serialize_union_of_registries(): serialized_str = "cat[Record|bionty.Gene]" assert serialize_dtype([ln.Record, bt.Gene]) == serialized_str serialized_str = "cat[bionty.CellType|bionty.CellLine]" assert serialize_dtype([bt.CellType, bt.CellLine]) == serialized_str def test_serialize_with_field_information(): serialized_str = "cat[bionty.Gene.ensembl_gene_id]" assert serialize_dtype(bt.Gene.ensembl_gene_id) == serialized_str serialized_str = "cat[bionty.CellType.uid|bionty.CellLine.uid]" assert serialize_dtype([bt.CellType.uid, bt.CellLine.uid]) == serialized_str # ----------------------------------------------------------------------------- # parsing serialized dtypes # ----------------------------------------------------------------------------- def test_simple_record_with_subtype_and_field(): # Create a Record type to get its UID customer_type = ln.Record(name="Customer", is_type=True).save() dtype_str = f"cat[Record[{customer_type.uid}].name]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "Record", "filter_str": "", "field_str": "name", "registry": Record, "field": Record.name, "record_uid": customer_type.uid, } customer_type.delete(permanent=True) def test_multiple_records_with_subtypes_and_fields(): # Create Record types to get their UIDs customer_type = ln.Record(name="Customer", is_type=True).save() supplier_type = ln.Record(name="Supplier", is_type=True).save() dtype_str = ( f"cat[Record[{customer_type.uid}].name|Record[{supplier_type.uid}].name]" ) result = parse_dtype(dtype_str) assert len(result) == 2 assert result[0] == { "registry_str": "Record", "filter_str": "", "field_str": "name", "registry": Record, "field": Record.name, "record_uid": customer_type.uid, } assert result[1] == { "registry_str": "Record", "filter_str": "", "field_str": "name", "registry": Record, "field": Record.name, "record_uid": supplier_type.uid, } customer_type.delete(permanent=True) supplier_type.delete(permanent=True) def test_bionty_celltype_with_field(): dtype_str = "cat[bionty.CellType.ontology_id]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "bionty.CellType", "filter_str": "", "field_str": "ontology_id", "registry": bt.CellType, "field": bt.CellType.ontology_id, } def test_bionty_perturbations_with_field(): dtype_str = "cat[bionty.CellType.uid|bionty.CellLine.uid]" result = parse_dtype(dtype_str) assert len(result) == 2 assert result[0] == { "registry_str": "bionty.CellType", "filter_str": "", "field_str": "uid", "registry": bt.CellType, "field": bt.CellType.uid, } assert result[1] == { "registry_str": "bionty.CellLine", "filter_str": "", "field_str": "uid", "registry": bt.CellLine, "field": bt.CellLine.uid, } def test_invalid_registry(): dtype_str = "cat[InvalidRegistry.field]" with pytest.raises(ValidationError) as exc_info: parse_dtype(dtype_str) assert "invalid dtype" in str(exc_info.value) def test_empty_category(): dtype_str = "cat[]" result = parse_dtype(dtype_str) assert result == [] def test_url_dtype_is_supported(): assert parse_dtype("url") == [] feature = ln.Feature(name="website", dtype="url") assert feature._dtype_str == "url" def test_malformed_categorical(): dtype_str = "cat ? 
str" with pytest.raises(ValueError) as err: parse_dtype(dtype_str) assert err.exconly().startswith( f"ValueError: dtype is '{dtype_str}' but has to be one of" ) dtype_str = "cat[Record[Customer.name" with pytest.raises(ValueError) as err: parse_dtype(dtype_str) assert err.exconly().startswith( f"ValueError: dtype is '{dtype_str}' but has to be one of" ) def test_simple_registry_without_field(): dtype_str = "cat[Record]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "Record", "filter_str": "", "field_str": "name", "registry": Record, "field": Record.name, } def test_registry_with_subtype_no_field(): # Create a Record type to get its UID customer_type = ln.Record(name="Customer", is_type=True).save() dtype_str = f"cat[Record[{customer_type.uid}]]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "Record", "filter_str": "", "field_str": "name", "registry": Record, "field": Record.name, "record_uid": customer_type.uid, } customer_type.delete(permanent=True) def test_list_of_dtypes(): # Create a Record type to get its UID customer_type = ln.Record(name="Customer", is_type=True).save() dtype_str = f"list[cat[Record[{customer_type.uid}]]]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "Record", "filter_str": "", "field_str": "name", "registry": Record, "field": Record.name, "record_uid": customer_type.uid, "list": True, } assert serialize_dtype(list[bt.CellLine]) == "list[cat[bionty.CellLine]]" customer_type.delete(permanent=True) def test_registry_with_filter(): dtype_str = "cat[bionty.Gene.ensembl_gene_id[source__id='abcd']]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "bionty.Gene", "filter_str": "source__id='abcd'", "field_str": "ensembl_gene_id", "registry": bt.Gene, "field": bt.Gene.ensembl_gene_id, } def test_nested_cat_dtypes(): # Create Record types - the deepest type is UScustomer customer_type = ln.Record(name="Customer", is_type=True).save() uscustomer_type = ln.Record( name="UScustomer", type=customer_type, is_type=True ).save() dtype_str = f"cat[Record[{uscustomer_type.uid}].name]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "Record", "filter_str": "", "field_str": "name", "registry": Record, "field": Record.name, "record_uid": uscustomer_type.uid, } uscustomer_type.delete(permanent=True) customer_type.delete(permanent=True) def test_nested_cat_with_filter(): # Create Record types - the deepest type is UScustomer # Note: filters in bracket content are not currently supported in UID format # This test may need adjustment based on how filters are handled customer_type = ln.Record(name="Customer", is_type=True).save() uscustomer_type = ln.Record( name="UScustomer", type=customer_type, is_type=True ).save() dtype_str = f"cat[Record[{uscustomer_type.uid}].description]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "Record", "filter_str": "", "field_str": "description", "registry": Record, "field": Record.description, "record_uid": uscustomer_type.uid, } uscustomer_type.delete(permanent=True) customer_type.delete(permanent=True) # ----------------------------------------------------------------------------- # parsing django filter expressions # ----------------------------------------------------------------------------- def test_feature_dtype(): feature = ln.Feature( name="disease", dtype=bt.Disease, 
cat_filters={ "source__uid": "4a3ejKuf" }, # uid corresponds to disease_ontology_old.uid ).save() result = parse_dtype(feature._dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "bionty.Disease", "filter_str": "source__uid='4a3ejKuf'", "field_str": "name", "registry": bt.Disease, "field": bt.Disease.name, } feature.delete(permanent=True) def test_cat_filters_incompatible_with_union_dtypes(): with pytest.raises(ValidationError) as exc_info: ln.Feature( name="test_feature", dtype="cat[Record|bionty.CellType]", cat_filters={"source": "test"}, ) assert ( "cat_filters are incompatible with union dtypes: 'cat[Record|bionty.CellType]'" in str(exc_info.value) ) def test_cat_filters_incompatible_with_nested_dtypes(): record = ln.Record(name="Customer", is_type=True).save() with pytest.raises(ValidationError) as exc_info: ln.Feature( name="test_feature", dtype=record, cat_filters={"source": "test"}, ) assert ( f"cat_filters are incompatible with nested dtypes: 'cat[Record[{record.uid}]]'" in str(exc_info.value) ) record.delete(permanent=True) def test_parse_filter_string_basic(): result = parse_filter_string("parent__id=123, category__name=electronics") expected = { "parent__id": ("parent", "id", "123"), "category__name": ("category", "name", "electronics"), } assert result == expected def test_parse_filter_string_direct_fields(): result = parse_filter_string("name=test, status=active") expected = {"name": ("name", None, "test"), "status": ("status", None, "active")} assert result == expected def test_parse_filter_string_empty(): with pytest.raises(ValueError) as e: parse_filter_string("") assert "missing '=' sign" in str(e) def test_parse_filter_string_malformed(): with pytest.raises(ValueError) as e: parse_filter_string("malformed_filter") assert "missing '=' sign" in str(e) def test_parse_filter_string_missing_key(): with pytest.raises(ValueError) as e: parse_filter_string("=someval") assert "empty key" in str(e) def test_parse_filter_string_missing_value(): with pytest.raises(ValueError) as e: parse_filter_string("somekey=") assert "empty val" in str(e) def test_resolve_direct_fields(): parsed = {"name": ("name", None, "test"), "status": ("status", None, "active")} result = resolve_relation_filters(parsed, bt.Gene) assert result == {"name": "test", "status": "active"} def test_resolve_relation_filter_with_uid(): source = bt.Source( name="test_name", description="test_description", organism="human", entity="bionty.Gene", version="2026-01-01", ) source.uid = "testuid1" source.save() parsed = {"source__uid": ("source", "uid", "testuid1")} result = resolve_relation_filters(parsed, bt.Gene) print(result) assert result == {"source": source} source.delete(permanent=True) def test_resolve_relation_filter_with_name(organism): parsed = {"organism__name": ("organism", "name", "test_organism")} result = resolve_relation_filters(parsed, bt.Gene) assert result == {"organism": organism} organism.delete(permanent=True) def test_resolve_multiple_relation_filters(organism): source = bt.Source( name="test_name", description="test_description", organism="human", entity="bionty.Gene", version="2026-01-01", ) source.uid = "testuid1" source.save() parsed = { "organism__name": ("organism", "name", "test_organism"), "source__uid": ("source", "uid", "testuid1"), } result = resolve_relation_filters(parsed, bt.Gene) assert result == {"organism": organism, "source": source} source.delete(permanent=True) organism.delete(permanent=True) def test_resolve_nested_filter(organism): parsed = 
{"organism__name__contains": ("organism", "name__contains", "test_orga")} result = resolve_relation_filters(parsed, bt.Gene) assert result == {"organism": organism} organism.delete(permanent=True) def test_resolve_relation_filter_failed_resolution(): parsed = {"organism__name": ("organism", "name", "nonexistent")} with pytest.raises(bt.Organism.DoesNotExist): resolve_relation_filters(parsed, bt.Gene) def test_resolve_relation_filter_duplicate(): parsed = { "source__uid": ("source", "uid", "testuid1"), "source__name": ("source", "name", "test_name"), } with pytest.raises(bt.Source.DoesNotExist): resolve_relation_filters(parsed, bt.Gene) # ----------------------------------------------------------------------------- # backward compatibility for old format strings # ----------------------------------------------------------------------------- def test_convert_old_format_ulabel_string(): """Test converting old format ULabel string to object.""" # Create a ULabel type perturbation = ln.ULabel(name="Perturbation", is_type=True).save() # Convert old format string dtype_obj = dtype_as_object("cat[ULabel[Perturbation]]", old_format=True) # Should return the ULabel object assert dtype_obj == perturbation assert hasattr(dtype_obj, "uid") # Clean up perturbation.delete(permanent=True) def test_convert_old_format_record_string(): """Test converting old format Record string to object.""" # Create a Record type sample_type = ln.Record(name="Sample", is_type=True).save() # Convert old format string dtype_obj = dtype_as_object("cat[Record[Sample]]", old_format=True) # Should return the Record object assert dtype_obj == sample_type assert hasattr(dtype_obj, "uid") # Clean up sample_type.delete(permanent=True) def test_convert_old_format_nested_record_string(): """Test converting old format nested Record string to object.""" # Create nested Record types lab_type = ln.Record(name="LabA", is_type=True).save() experiment_type = ln.Record(name="Experiment", type=lab_type, is_type=True).save() # Convert old format string dtype_obj = dtype_as_object("cat[Record[LabA[Experiment]]]", old_format=True) # Should return the nested Record object assert dtype_obj == experiment_type assert hasattr(dtype_obj, "uid") # Clean up experiment_type.delete(permanent=True) lab_type.delete(permanent=True) def test_convert_old_format_list_string(): """Test converting old format list string to object.""" # Create a ULabel type perturbation = ln.ULabel(name="Perturbation", is_type=True).save() # Convert old format string with list wrapper dtype_obj = dtype_as_object("list[cat[ULabel[Perturbation]]]", old_format=True) # Should return list[ULabel] type assert hasattr(dtype_obj, "__origin__") assert dtype_obj.__origin__ is list # Get the inner type from typing import get_args inner_type = get_args(dtype_obj)[0] assert inner_type == perturbation # Clean up perturbation.delete(permanent=True) def test_feature_constructor_with_old_format_string(ccaplog): """Test Feature constructor with old format string raises deprecation warning.""" # Create a ULabel type perturbation = ln.ULabel(name="Perturbation", is_type=True).save() # Create feature with old format string feature = ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]") assert ( "rather than passing a string 'cat[ULabel[Perturbation]]' to dtype, consider passing a Python object" in ccaplog.text ) # Should have converted to UID format assert feature._dtype_str is not None assert "ULabel[" in feature._dtype_str # Should contain UID, not name assert "Perturbation" not in 
feature._dtype_str assert perturbation.uid in feature._dtype_str # Clean up perturbation.delete(permanent=True) def test_feature_constructor_with_old_format_nested_string(ccaplog): """Test Feature constructor with old format nested string.""" # Create nested Record types lab_type = ln.Record(name="LabA", is_type=True).save() experiment_type = ln.Record(name="Experiment", type=lab_type, is_type=True).save() # Create feature with old format nested string feature = ln.Feature(name="experiment", dtype="cat[Record[LabA[Experiment]]]") assert ( "rather than passing a string 'cat[Record[LabA[Experiment]]]' to dtype, consider passing a Python object" in ccaplog.text ) # Should have converted to UID format assert feature._dtype_str is not None assert "Record[" in feature._dtype_str # Should contain UID, not names assert "LabA" not in feature._dtype_str assert "Experiment" not in feature._dtype_str assert experiment_type.uid in feature._dtype_str # Clean up experiment_type.delete(permanent=True) lab_type.delete(permanent=True) def test_bare_cat_dtype_backward_compatibility(): """Test that bare 'cat' dtype is accepted for backward compatibility.""" # Test parse_dtype accepts "cat" and returns empty list result = parse_dtype("cat") assert result == [] # Test Feature constructor with bare "cat" dtype issues deprecation warning with pytest.warns(DeprecationWarning, match="dtype `cat` is deprecated"): feature = ln.Feature(name="test_bare_cat", dtype="cat") assert feature._dtype_str == "cat" def test_migrate_dtype_to_uid_format(): """Test migrate_dtype_to_uid_format() function for migration.""" from django.db import connection from lamindb.models.feature import migrate_dtype_to_uid_format # Create Record types for testing lab_type = ln.Record(name="LabA", is_type=True).save() experiment_type = ln.Record(name="Experiment", type=lab_type, is_type=True).save() perturbation = ln.ULabel(name="Perturbation", is_type=True).save() # Create features with old format strings in _dtype_str feature1 = ln.Feature(name="test_record_old_format", dtype="str").save() feature2 = ln.Feature(name="test_ulabel_old_format", dtype="str").save() feature3 = ln.Feature(name="test_list_record_old_format", dtype="str").save() feature4 = ln.Feature(name="test_list_ulabel_old_format", dtype="str").save() # Manually set old format strings using raw SQL old_format_record = "cat[Record[LabA[Experiment]]]" old_format_ulabel = "cat[ULabel[Perturbation]]" old_format_list_record = "list[cat[Record[LabA[Experiment]]]]" old_format_list_ulabel = "list[cat[ULabel[Perturbation]]]" with connection.cursor() as cursor: cursor.execute( "UPDATE lamindb_feature SET _dtype_str = %s WHERE id = %s", [old_format_record, feature1.id], ) cursor.execute( "UPDATE lamindb_feature SET _dtype_str = %s WHERE id = %s", [old_format_ulabel, feature2.id], ) cursor.execute( "UPDATE lamindb_feature SET _dtype_str = %s WHERE id = %s", [old_format_list_record, feature3.id], ) cursor.execute( "UPDATE lamindb_feature SET _dtype_str = %s WHERE id = %s", [old_format_list_ulabel, feature4.id], ) # Refresh features from database feature1.refresh_from_db() feature2.refresh_from_db() feature3.refresh_from_db() feature4.refresh_from_db() # Verify old format is present assert feature1._dtype_str == old_format_record assert feature2._dtype_str == old_format_ulabel assert feature3._dtype_str == old_format_list_record assert feature4._dtype_str == old_format_list_ulabel # Run migration function migrate_dtype_to_uid_format(connection, input_field="_dtype_str") # Refresh features from 
database feature1.refresh_from_db() feature2.refresh_from_db() feature3.refresh_from_db() feature4.refresh_from_db() # Verify conversion to UID format assert feature1._dtype_str == f"cat[Record[{experiment_type.uid}]]" assert feature2._dtype_str == f"cat[ULabel[{perturbation.uid}]]" assert feature3._dtype_str == f"list[cat[Record[{experiment_type.uid}]]]" assert feature4._dtype_str == f"list[cat[ULabel[{perturbation.uid}]]]" # Verify old names are not in the converted strings assert "LabA" not in feature1._dtype_str assert "Experiment" not in feature1._dtype_str assert "Perturbation" not in feature2._dtype_str assert "LabA" not in feature3._dtype_str assert "Experiment" not in feature3._dtype_str assert "Perturbation" not in feature4._dtype_str # Verify UIDs are present assert experiment_type.uid in feature1._dtype_str assert perturbation.uid in feature2._dtype_str assert experiment_type.uid in feature3._dtype_str assert perturbation.uid in feature4._dtype_str # Clean up feature1.delete(permanent=True) feature2.delete(permanent=True) feature3.delete(permanent=True) feature4.delete(permanent=True) experiment_type.delete(permanent=True) lab_type.delete(permanent=True) perturbation.delete(permanent=True) ================================================ FILE: tests/core/test_from_values.py ================================================ import bionty as bt import lamindb as ln import pandas as pd import pytest @pytest.fixture(scope="module") def df(): return pd.DataFrame( ( ["T cell", "CL:0000084"], ["hepatocyte", "CL:0000182"], ["my new cell type", ""], ), columns=["cell_type", "cell_type_id"], ) def test_from_values_name(df): bt.CellType.filter().delete(permanent=True) assert df["cell_type"].tolist() == ["T cell", "hepatocyte", "my new cell type"] # create records from bionty result = bt.CellType.from_values(df.cell_type, "name") ids = [i.ontology_id for i in result] assert len(result) == 2 assert set(ids) == {"CL:0000084", "CL:0000182"} assert result[0].source.entity == "bionty.CellType" # wrong field type with pytest.raises(TypeError): result = bt.CellType.from_values(df.cell_type, field=bt.CellType) def test_from_values_ontology_id(df): assert df["cell_type_id"].tolist() == ["CL:0000084", "CL:0000182", ""] result = bt.CellType.from_values(df.cell_type_id, "ontology_id") names = {i.name for i in result} assert len(result) == 2 assert names == {"T cell", "hepatocyte"} assert result[0].source.entity == "bionty.CellType" def test_from_values_multiple_match(): records = bt.Gene.from_values(["ABC1", "PDCD1"], bt.Gene.symbol, organism="human") assert len(records) == 3 def test_get_or_create_records(): names = ["record" + str(i) for i in range(25)] labels = [ln.Record(name=name) for name in names] ln.save(labels) # more than 20 existing values labels = ln.Record.from_values(names, field="name") assert len(labels) == 25 def test_from_values_synonyms_aware(): bt.CellType.from_source(name="T cell").save() # existing validated values records = bt.CellType.from_values(["T cell"], "name") assert len(records) == 1 assert records[0].name == "T cell" assert isinstance(records[0].source, bt.Source) # existing validated values and synonyms records = bt.CellType.from_values(["T cell", "T-cell"], "name") assert len(records) == 1 assert records[0].name == "T cell" assert isinstance(records[0].source, bt.Source) # bionty values and synonyms records = bt.CellType.from_values(["B-cell", "B cell"], "name") assert len(records) == 1 assert records[0].name == "B cell" assert isinstance(records[0].source, bt.Source) 
# all possibilities of validated values records = bt.CellType.from_values( ["T cell", "T-cell", "t cell", "B cell", "B-cell"], "name" ) assert len(records) == 2 names = [r.name for r in records] assert set(names) == {"T cell", "B cell"} assert isinstance(records[0].source, bt.Source) assert isinstance(records[1].source, bt.Source) # non-validated values records = bt.CellType.from_values(["T cell", "mycell"], "name") assert len(records) == 1 assert records[0].name == "T cell" assert isinstance(records[0].source, bt.Source) assert records[0].ontology_id == "CL:0000084" bt.CellType.filter().delete(permanent=True) def test_standardize(): # only name field can be standardized results = bt.Gene.from_values( ["HES4", "TNFRSF4"], field=bt.Gene.ensembl_gene_id, organism="human" ) assert len(results) == 0 results = bt.Gene.from_values( ["HES4", "TNFRSF4"], field=bt.Gene.symbol, organism="human" ) assert len(results) == 2 def test_from_values_no_source(): # remove source of ExperimentalFactor source = bt.Source.filter(entity="bionty.ExperimentalFactor").first() source.delete(permanent=True) assert not bt.ExperimentalFactor.from_values(["scrnaseq"]) source.save() ================================================ FILE: tests/core/test_has_parents.py ================================================ import bionty as bt import lamindb as ln def test_view_parents(): label1 = ln.Record(name="label1") label2 = ln.Record(name="label2") label1.save() label2.save() label1.parents.add(label2) label1.view_parents(ln.Record.name, distance=1) label1.delete(permanent=True) label2.delete(permanent=True) def test_query_parents_children(): label1 = ln.Record(name="label1").save() label2 = ln.Record(name="label2").save() label3 = ln.Record(name="label3").save() label1.children.add(label2) label2.children.add(label3) parents = label3.query_parents() assert len(parents) == 2 assert label1 in parents and label2 in parents children = label1.query_children() assert len(children) == 2 assert label2 in children and label3 in children label1.delete(permanent=True) label2.delete(permanent=True) label3.delete(permanent=True) def test_view_lineage_circular(): import pandas as pd transform = ln.Transform(key="test").save() run = ln.Run(transform=transform).save() artifact = ln.Artifact.from_dataframe( pd.DataFrame({"a": [1, 2, 3]}), description="test artifact", run=run ).save() run.input_artifacts.add(artifact) artifact.view_lineage() artifact.delete(permanent=True) transform.delete(permanent=True) def test_view_parents_connected_instance(): ct = bt.CellType.connect("laminlabs/cellxgene").first() if ct and hasattr(ct, "parents"): ct.view_parents(distance=2, with_children=True) def test_query_relatives_connected_instance(): ct = bt.CellType.connect("laminlabs/cellxgene").filter(name="T cell").first() if ct: parents = ct.query_parents() assert parents.db == "laminlabs/cellxgene" children = ct.query_children() assert children.db == "laminlabs/cellxgene" def test_view_lineage_connected_instance(): af = ln.Artifact.connect("laminlabs/cellxgene").first() if af and af.run: af.view_lineage() ================================================ FILE: tests/core/test_has_type.py ================================================ import os import lamindb as ln import pytest from django.db import IntegrityError @pytest.mark.parametrize( "model_class,extra_kwargs", [ (ln.Record, {}), (ln.Feature, {"dtype": "str"}), (ln.Schema, {"itype": ln.Feature}), (ln.Project, {}), (ln.Reference, {}), (ln.ULabel, {}), ], ) def test_invalid_type(model_class, 
extra_kwargs): # also see test_invalid_type_record_with_schema in test_record.py model_name = model_class.__name__.lower() no_type = model_class(name="no_type", **extra_kwargs).save() if model_name == "schema": extra_kwargs["is_type"] = True # to avoid triggering hash look up with pytest.raises(ValueError) as error: model_class(name="WithInvalidType", type=no_type, **extra_kwargs).save() assert error.exconly().startswith( f"ValueError: You can only assign a {model_name} with `is_type=True` as `type` to another {model_name}" ) # test at the database level if os.getenv("LAMINDB_TEST_DB_VENDOR") != "sqlite": no_type.is_type = True with pytest.raises(IntegrityError) as error: model_class(name="WithInvalidType", type=no_type, **extra_kwargs).save() assert f"{model_name}_type_is_valid_fk" in error.exconly() no_type.delete(permanent=True) @pytest.mark.skipif( os.getenv("LAMINDB_TEST_DB_VENDOR") == "sqlite", reason="Postgres-only" ) @pytest.mark.parametrize("model_class", [ln.Record, ln.ULabel]) def test_prevent_type_cycle(model_class): type_a = model_class(name="TypeA", is_type=True).save() type_b = model_class(name="TypeB", is_type=True).save() # Set A's parent to B type_a.type = type_b type_a.save() # A → B, this is fine # Try to set B's parent to A (would create cycle B → A → B) type_b.type = type_a with pytest.raises(Exception) as exc_info: type_b.save() assert "cycle" in str(exc_info.value).lower() # Try to set type to itself type_a.type = type_a with pytest.raises(Exception) as exc_info: type_a.save() assert "cycle" in str(exc_info.value).lower() type_a.delete(permanent=True) type_b.delete(permanent=True) @pytest.mark.parametrize("model_class", [ln.Record, ln.ULabel, ln.Project]) def test_query_sub_types_super_types_instances(model_class): model_name = model_class.__name__.lower() # Create type hierarchy type1 = model_class(name="Type1", is_type=True).save() type2 = model_class(name="Type2", is_type=True, type=type1).save() type3 = model_class(name="Type3", is_type=True, type=type2).save() # Create instances instance1 = model_class(name=f"{model_name}1", type=type1).save() instance2 = model_class(name=f"{model_name}2", type=type3).save() instance3 = model_class(name=f"{model_name}3", type=type3).save() # Get the query method dynamically query_method = getattr(type1, f"query_{model_name}s") # Children assert getattr(type1, model_name + "s").count() == 2 # direct instances assert query_method().count() == 5 # Super types super_types = instance3.query_types() assert len(super_types) == 3 assert super_types[0] == type3 assert super_types[1] == type2 assert super_types[2] == type1 # Move type2 to trash type2.delete() assert query_method().count() == 1 # Cleanup instance1.delete(permanent=True) instance2.delete(permanent=True) instance3.delete(permanent=True) type3.delete(permanent=True) type2.delete(permanent=True) type1.delete(permanent=True) ================================================ FILE: tests/core/test_integrity.py ================================================ import lamindb_setup as ln_setup def test_migrate_check(): assert ln_setup.migrate.check() def test_system_check(): ln_setup.django("check") ================================================ FILE: tests/core/test_is_versioned.py ================================================ import lamindb as ln import pandas as pd import pytest from lamindb.models._is_versioned import ( _adjust_is_latest_when_deleting_is_versioned, bump_version, set_version, ) @pytest.fixture(scope="module") def df1(): return pd.DataFrame({"feat1": [1, 2]}) 
@pytest.fixture(scope="module") def df2(): return pd.DataFrame({"feat1": [2, 3]}) def test_set_version(): # all remaining lines are covered in notebooks with pytest.raises(ValueError): set_version(None, "weird-version") assert set_version(None, "1.2") == "2" assert set_version(None, "0") == "1" assert set_version(None, "1") == "2" assert set_version("1.2.3", "0") == "1.2.3" assert set_version("1.2.3") == "1.2.3" def test_bump_version(): current_version_major_only = "2" current_version_major_minor = "2.1" weird_version = "weird-version" with pytest.raises(ValueError): bump_version(weird_version) assert bump_version(weird_version, behavior="ignore") == "?" assert bump_version(current_version_major_only, bump_type="major") == "3" assert bump_version(current_version_major_only, bump_type="minor") == "2.1" assert bump_version(current_version_major_minor, bump_type="major") == "3" assert bump_version(current_version_major_minor, bump_type="minor") == "2.2" def test_add_to_version_family(df1, df2): artifact1 = ln.Artifact.from_dataframe(df1, description="test1").save() artifact2 = ln.Artifact.from_dataframe(df2, description="test2").save() assert ( artifact1.uid[: artifact1._len_stem_uid] != artifact2.uid[: artifact2._len_stem_uid] ) artifact2._add_to_version_family(artifact1) assert ( artifact1.uid[: artifact1._len_stem_uid] == artifact2.uid[: artifact2._len_stem_uid] ) assert ( artifact1.path.name[: artifact1._len_stem_uid] == artifact2.path.name[: artifact2._len_stem_uid] ) artifact1.delete(permanent=True) artifact2.delete(permanent=True) def test_transform_versioning_based_on_key(): transform1 = ln.Transform( key="test-pipeline", version="1.0", source_code="1", kind="pipeline", ).save() assert transform1.is_latest assert transform1.version_tag == "1.0" assert transform1.version == "1.0" with pytest.raises(ValueError) as e: transform2 = ln.Transform( key="test-pipeline", version="1.0", source_code="2", kind="pipeline", ).save() assert ( e.exconly() == "ValueError: Please change the version tag or leave it `None`, '1.0' is already taken" ) transform2 = ln.Transform( key="test-pipeline", # do not pass the version tag, which corresponds to: version=None source_code="2", kind="pipeline", ).save() assert transform2.version_tag is None assert transform2.version == transform2.uid[-4:] # version falls back to uid suffix assert transform2.is_latest assert transform2.hash != transform1.hash assert not ln.Transform.get(key="test-pipeline", version="1.0").is_latest transform3 = ln.Transform( key="test-pipeline", version="abcd", # mimic commit hash source_code="3", kind="pipeline", ).save() assert transform3.version_tag == "abcd" assert transform3.version == "abcd" assert transform3.is_latest assert transform3.hash != transform2.hash assert not ln.Transform.get(key="test-pipeline", source_code="2").is_latest def test_transform_versioning_based_on_revises(): # build one version family transform_v1 = ln.Transform(key="Introduction").save() assert transform_v1.is_latest assert transform_v1.version_tag is None # pass the latest version transform_v2 = ln.Transform( key="Introduction v2", revises=transform_v1, version="2" ).save() assert not transform_v1.is_latest assert transform_v2.is_latest assert transform_v2.uid.endswith("0001") assert transform_v2.version_tag == "2" assert transform_v2.version == "2" # consciously *not* pass the latest version to revises but the previous # it automatically retrieves the latest version transform_v3 = ln.Transform(key="Introduction", revises=transform_v1).save() assert 
transform_v3.uid.endswith("0002") assert not ln.Transform.get(key="Introduction v2", version="2").is_latest assert transform_v3.is_latest # no source code code was yet saved, returning existing transform with same key transform_v4 = ln.Transform(key="Introduction").save() assert transform_v4 == transform_v3 assert len(ln.Transform.filter(key="Introduction")) == 2 assert len(ln.Transform.filter(key="Introduction").filter(is_latest=True)) == 1 assert ln.Transform.get(key="Introduction") == transform_v3 assert ln.Transform.filter(key="Introduction").get(is_latest=True) == transform_v3 # test get assert ln.Transform.get(transform_v3.uid) == transform_v3 assert ln.Transform.get(transform_v3.id) == transform_v3 assert ln.Transform.get(transform_v3.uid[:-4]) == transform_v3 # test empty QuerySet assert ( ln.Transform.filter(key="IntroductionNotExists") .filter(is_latest=True) .one_or_none() is None ) # test soft delete transform_v3.delete() assert transform_v2.is_latest # test hard delete transform_v2.delete(permanent=True) assert ( transform_v1_retrieved := ln.Transform.get(transform_v3.uid[:-4]) ) == transform_v1 assert transform_v1_retrieved.is_latest # test soft delete on the last existing version does not change is_latest transform_v1_retrieved.delete() assert ( transform_v1_retrieved := ln.Transform.get(transform_v1.uid) ) == transform_v1 assert transform_v1_retrieved.is_latest # fully delete transform_v1.delete(permanent=True) # last object that exists is in the trash assert ln.Transform.get(transform_v3.uid[:-4]) == transform_v3 assert transform_v3.branch_id == -1 transform_v3.delete(permanent=True) def test_transform_versioning_across_branches_preserves_main_latest(): main_branch = ln.Branch.get(name="main") ln.setup.switch(main_branch.name) branch = ln.Branch(name="test_versioning_branch_latest").save() transform_v1 = ln.Transform( key="test-branch-aware-is-latest", source_code="main-v1", kind="pipeline", ).save() try: ln.setup.switch(branch.name) transform_v2 = ln.Transform( key="test-branch-aware-is-latest", revises=transform_v1, source_code="feature-v2", kind="pipeline", ).save() transform_v1.refresh_from_db() assert transform_v1.is_latest assert transform_v2.is_latest # Passing an older revises still increments from the family max uid. 
transform_v3 = ln.Transform( key="test-branch-aware-is-latest", revises=transform_v1, source_code="feature-v3", kind="pipeline", ).save() transform_v2.refresh_from_db() transform_v1.refresh_from_db() assert transform_v3.uid.endswith("0002") assert not transform_v2.is_latest assert transform_v3.is_latest assert transform_v1.is_latest finally: ln.setup.switch(main_branch.name) for uid in (transform_v1.uid[:-4],): for record in ln.Transform.objects.filter(uid__startswith=uid): record.delete(permanent=True) branch.delete(permanent=True) def test_path_rename(): # this is related to renames inside _add_to_version_family with open("test_new_path.txt", "w") as f: f.write("test_new_path") old_path = ln.UPath("s3://lamindata/.lamindb/test_new_path.txt") old_path.upload_from("./test_new_path.txt") assert old_path.exists() new_path = old_path.rename(old_path.with_name("test_new_path2.txt")) assert new_path.exists() assert new_path.as_posix() == "s3://lamindata/.lamindb/test_new_path2.txt" assert not old_path.exists() new_path.unlink() ln.UPath("./test_new_path.txt").unlink() def test_version_backward_compatibility(): """Test that queries using version= still work (backward compatibility).""" # Create transforms with different versions and source_code to avoid deduplication transform1 = ln.Transform( key="test-backward-compat", version="1.0", kind="pipeline", source_code="code1", ).save() transform2 = ln.Transform( key="test-backward-compat", version="2.0", kind="pipeline", source_code="code2", ).save() # Test that we can query using version= (old API) found = ln.Transform.get(key="test-backward-compat", version="1.0") assert found == transform1 assert found.version_tag == "1.0" assert found.version == "1.0" found = ln.Transform.get(key="test-backward-compat", version="2.0") assert found == transform2 assert found.version_tag == "2.0" assert found.version == "2.0" # Test filter with version= results = ln.Transform.filter(key="test-backward-compat", version="1.0") assert len(results) == 1 assert results.first() == transform1 # Test with Artifact artifact1 = ln.Artifact.from_dataframe( pd.DataFrame({"col1": [1, 2]}), key="test-artifact.parquet", version="1.0" ).save() artifact2 = ln.Artifact.from_dataframe( pd.DataFrame({"col1": [3, 4]}), key="test-artifact.parquet", version="2.0" ).save() found_artifact = ln.Artifact.get(key="test-artifact.parquet", version="1.0") assert found_artifact == artifact1 assert found_artifact.version_tag == "1.0" assert found_artifact.version == "1.0" found_artifact = ln.Artifact.get(key="test-artifact.parquet", version="2.0") assert found_artifact == artifact2 assert found_artifact.version_tag == "2.0" assert found_artifact.version == "2.0" # Cleanup transform1.delete(permanent=True) transform2.delete(permanent=True) artifact1.delete(permanent=True) artifact2.delete(permanent=True) def test_adjust_is_latest_when_deleting_is_versioned(): """Direct unit test for _adjust_is_latest_when_deleting_is_versioned (covers multiple promoted).""" # Build two version families, each with v1 (older) and v2 (latest) v1a = ln.Transform(key="Adjust latest family A").save() v2a = ln.Transform(revises=v1a, key="Adjust latest family A").save() v1b = ln.Transform(key="Adjust latest family B").save() v2b = ln.Transform(revises=v1b, key="Adjust latest family B").save() assert v2a.is_latest and v2b.is_latest assert not v1a.is_latest and not v1b.is_latest # Delete both latest → two promoted (covers "new latest ... 
versions: [...]" branch) promoted = _adjust_is_latest_when_deleting_is_versioned([v2a, v2b]) assert len(promoted) == 2 assert set(promoted) == {v1a.pk, v1b.pk} v1a.refresh_from_db() v1b.refresh_from_db() assert v1a.is_latest and v1b.is_latest # Edge case: empty list returns [] assert _adjust_is_latest_when_deleting_is_versioned([]) == [] # Clean up v2a.delete(permanent=True) v2b.delete(permanent=True) v1a.delete(permanent=True) v1b.delete(permanent=True) ================================================ FILE: tests/core/test_label_manager.py ================================================ from pathlib import Path import bionty as bt import lamindb as ln import pytest from _dataset_fixtures import ( # noqa get_mini_csv, ) from lamindb.errors import ValidationError from lamindb.models.artifact import add_labels @pytest.fixture(scope="module") def adata(): adata = ln.examples.datasets.anndata_with_obs() # add another column adata.obs["cell_type_by_expert"] = adata.obs["cell_type"] adata.obs.loc["obs0", "cell_type_by_expert"] = "B cell" return adata def test_labels_add(adata): label = ln.Record(name="Experiment 1") artifact = ln.Artifact.from_anndata(adata, description="test").save() experiment = ln.Feature(name="experiment", dtype=ln.Record) with pytest.raises(ValueError) as error: artifact.labels.add("experiment_1", experiment) assert ( error.exconly() == "ValueError: Please pass a record (a `SQLRecord` object), not a string, e.g.," " via: label = ln.Record(name='experiment_1')" ) with pytest.raises(ValidationError) as error: artifact.labels.add(label, experiment) assert "not validated. If it looks correct: record.save()" in error.exconly() label.save() with pytest.raises(TypeError) as error: artifact.labels.add(label, "experiment 1") with pytest.raises(ValidationError) as error: artifact.labels.add(label, feature=experiment) assert ( error.exconly() == "lamindb.errors.ValidationError: Feature not validated. 
If it looks" " correct: ln.Feature(name='experiment', type='cat[Record]').save()" ) experiment.save() # try to pass list of length zero artifact.labels.add([], feature=experiment) # now pass a single label artifact.labels.add(label, feature=experiment) # check that the feature was updated with type = "Record" feature = ln.Feature.get(name="experiment") assert feature._dtype_str == "cat[Record]" with pytest.raises(TypeError): experiments = artifact.labels.get("experiment") # check that the label is there, it's exactly one label with name "Experiment 1" experiments = artifact.labels.get(experiment) assert experiments.one().name == "Experiment 1" # try adding the same label again, nothing should happen artifact.labels.add(label, feature=experiment) # check that the label is there, it's exactly one label with name "Experiment 1" experiments = artifact.labels.get(experiment) assert experiments.get().name == "Experiment 1" # running from_values to load validated label records under the hood experiment = ln.Feature(name="experiment_with_reg", dtype="cat[Record]").save() ln.Record(name="Experiment 2").save() artifact.labels.add("Experiment 2", experiment) experiments = artifact.labels.get(experiment) assert experiments.get().name == "Experiment 2" # now, try adding a new label project = ln.Record(name="project 1").save() ln.Feature(name="project", dtype=ln.Record).save() features = ln.Feature.lookup() artifact.labels.add(project, feature=features.project) # check that the label is there, it's exactly one label with name "Experiment 1" projects = artifact.labels.get(features.project) assert projects.get().name == "project 1" # test add_from adata2 = adata.copy() adata2.uns["mutated"] = True artifact2 = ln.Artifact(adata2, description="My new artifact").save() artifact2.labels.add_from(artifact) experiments = artifact2.labels.get(experiment) assert experiments.get().name == "Experiment 2" artifact2.delete(permanent=True) artifact.delete(permanent=True) ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) ln.Record.filter().delete(permanent=True) def test_labels_add_using_anndata(adata): organism = bt.Organism.from_source(name="mouse") cell_types = [bt.CellType(name=name) for name in adata.obs["cell_type"].unique()] ln.save(cell_types) inspector = bt.CellType.inspect(adata.obs["cell_type_by_expert"].unique()) ln.save([bt.CellType(name=name) for name in inspector.non_validated]) cell_types_from_expert = bt.CellType.from_values( adata.obs["cell_type_by_expert"].unique() ) actual_tissues = [bt.Tissue(name=name) for name in adata.obs["tissue"].unique()] organoid = ln.Record(name="organoid") tissues = actual_tissues + [organoid] ln.save(tissues) # clean up DB state organism_feature = ln.Feature.filter(name="organism").one_or_none() if organism_feature is not None: organism_feature.delete(permanent=True) artifact = ln.Artifact.filter(description="Mini adata").one_or_none() if artifact is not None: artifact.delete(permanent=True, storage=True) ln.Schema.filter().delete(permanent=True) # try to construct without registering metadata features artifact = ln.Artifact.from_anndata(adata, description="Mini adata") if not artifact._state.adding: artifact.delete(permanent=True) # make sure we get a fresh one artifact = ln.Artifact.from_anndata(adata, description="Mini adata") # add feature set without saving file feature_name_feature = ln.Feature(name="feature name", dtype="cat[Record]").save() schema = ln.Schema(features=[feature_name_feature]) with pytest.raises(ValueError) as 
error: artifact.features._add_schema(schema, slot="random") assert ( error.exconly() == "ValueError: Please save the artifact or collection before adding a feature" " set!" ) # now register features we want to validate # (we are not interested in cell_type_id, here) ln.Feature(name="cell_type", dtype=bt.CellType).save() ln.Feature(name="disease", dtype=ln.Record).save() ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save() artifact = ln.Artifact.from_anndata(adata, description="Mini adata") ln.Feature(name="organism", dtype=bt.Organism).save() features = ln.Feature.lookup() with pytest.raises(ValueError) as error: artifact.labels.add(organism, feature=features.organism) assert ( error.exconly() == "ValueError: Please save the artifact/collection before adding a label!" ) artifact.save() # now, we add organism and run checks features = ln.Feature.lookup() with pytest.raises(ln.errors.ValidationError): artifact.labels.add(organism, feature=features.organism) organism.save() artifact.labels.add(organism, feature=features.organism) organism_link = artifact.links_organism.first() assert organism_link.organism.name == "mouse" assert organism_link.feature.name == "organism" feature = ln.Feature.get(name="organism") assert feature._dtype_str == "cat[bionty.Organism]" # now we add cell types & tissues and run checks ln.Feature(name="cell_type", dtype=bt.CellType).save() ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save() add_labels(artifact, cell_types, feature=features.cell_type, from_curator=True) add_labels( artifact, cell_types_from_expert, feature=features.cell_type_by_expert, from_curator=True, ) feature_tissue_simple = ln.Feature(name="tissue_simple", dtype=bt.Tissue).save() with pytest.raises(ValidationError) as err: add_labels(artifact, tissues, feature=feature_tissue_simple, from_curator=True) assert ( err.exconly() == "lamindb.errors.ValidationError: Label type Record is not valid for Feature(name='tissue_simple', dtype='cat[bionty.Tissue]'), consider a feature with dtype='cat[bionty.Tissue|Record]'" ) tissue = ln.Feature(name="tissue", dtype="cat[bionty.Tissue|Record]").save() add_labels(artifact, tissues, feature=tissue, from_curator=True) feature = ln.Feature.get(name="cell_type") assert feature._dtype_str == "cat[bionty.CellType]" feature = ln.Feature.get(name="cell_type_by_expert") assert feature._dtype_str == "cat[bionty.CellType]" feature = ln.Feature.get(name="tissue") assert feature._dtype_str == "cat[bionty.Tissue|Record]" diseases = [ln.Record(name=name) for name in adata.obs["disease"].unique()] ln.save(diseases) add_labels(artifact, diseases, feature=features.disease, from_curator=True) # now, let's add another feature to ext experiment_1 = ln.Record(name="experiment_1").save() ln.Feature(name="experiment", dtype=ln.Record).save() features = ln.Feature.lookup() artifact.labels.add(experiment_1, feature=features.experiment) assert set(artifact.labels.get(features.experiment).to_list("name")) == { "experiment_1" } assert set(artifact.labels.get(features.disease).to_list("name")) == { "chronic kidney disease", "Alzheimer disease", "liver lymphoma", "cardiac ventricle disorder", } assert set(artifact.labels.get(features.organism).to_list("name")) == {"mouse"} assert set( artifact.labels.get(features.tissue)["bionty.Tissue"].to_list("name") ) == { "liver", "heart", "kidney", "brain", } assert set(artifact.labels.get(features.tissue)["Record"].to_list("name")) == { "organoid", } # currently, we can't stratify the two cases below assert 
set(artifact.labels.get(features.cell_type).to_list("name")) == { "T cell", "my new cell type", "hepatocyte", "hematopoietic stem cell", "B cell", } assert set(artifact.labels.get(features.cell_type, flat_names=True)) == { "T cell", "my new cell type", "hepatocyte", "hematopoietic stem cell", "B cell", } assert set(artifact.labels.get(features.cell_type_by_expert).to_list("name")) == { "T cell", "my new cell type", "hepatocyte", "hematopoietic stem cell", "B cell", } assert experiment_1 in artifact.records.all() # call describe artifact.describe() # clean up artifact.delete(permanent=True) bt.Gene.filter().delete(permanent=True) bt.Organism.filter().delete(permanent=True) ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) bt.CellType.filter().delete(permanent=True) bt.Tissue.filter().delete(permanent=True) bt.Disease.filter().delete(permanent=True) ln.Record.filter().delete(permanent=True) def test_labels_get(get_mini_csv: Path): # noqa: F811 artifact = ln.Artifact(get_mini_csv, description="test") # feature doesn't exist with pytest.raises(TypeError): artifact.labels.get("x") # type: ignore # no linked labels feature_name_feature = ln.Feature(name="feature name", dtype=ln.ULabel).save() schema = ln.Schema(features=[feature_name_feature]).save() artifact.save() # test for deprecated add_schema artifact.features._add_schema(schema, slot="random") assert artifact.schemas.first() == schema artifact.delete(permanent=True, storage=True) schema.delete(permanent=True) feature_name_feature.delete(permanent=True) @pytest.fixture def get_test_artifacts(): with open("./default_storage_unit_core/test-inherit1", "w") as f: f.write("artifact1") with open("./default_storage_unit_core/test-inherit2", "w") as f: f.write("artifact2") artifact1 = ln.Artifact("./default_storage_unit_core/test-inherit1") artifact1.save() artifact2 = ln.Artifact("./default_storage_unit_core/test-inherit2") artifact2.save() yield artifact1, artifact2 artifact1.delete(permanent=True, storage=True) artifact2.delete(permanent=True, storage=True) def test_add_from(get_test_artifacts): artifact1, artifact2 = get_test_artifacts label_names = [f"Project {i}" for i in range(3)] records = [ln.Record(name=label_name) for label_name in label_names] ln.save(records) cell_line_names = [f"Cell line {i}" for i in range(3)] cell_lines = [bt.CellLine(name=name) for name in cell_line_names] ln.save(cell_lines) # pass a list of length 0 artifact2.labels.add([]) # now actually pass the labels artifact2.labels.add(records) # here test add without passing a feature artifact2.labels.add(cell_lines) assert artifact2.cell_lines.count() == len(cell_lines) assert artifact1.records.exists() is False artifact1.labels.add_from(artifact2) assert artifact1.records.count() == artifact2.records.count() assert artifact1.cell_lines.count() == artifact2.cell_lines.count() artifact2.cell_lines.remove(*cell_lines) artifact1.cell_lines.remove(*cell_lines) artifact2.records.remove(*records) artifact1.records.remove(*records) for record in records: record.delete(permanent=True) for cell_line in cell_lines: cell_line.delete(permanent=True) ================================================ FILE: tests/core/test_load.py ================================================ from pathlib import Path import anndata as ad import lamindb as ln import pandas as pd import pytest # ruff: noqa: F811 from _dataset_fixtures import get_small_mdata, get_small_sdata # noqa @pytest.fixture(scope="module") def zip_file(): filepath = Path("test.zip") with 
open(filepath, "w") as f: f.write("some") yield filepath filepath.unlink() @pytest.fixture(scope="module") def html_filepath(): filepath = Path("./tmp.html") with open(filepath, "w") as f: f.write("

Test

") yield filepath filepath.unlink() @pytest.fixture(scope="module") def json_filepath(): filepath = Path("./tmp.json") with open(filepath, "w") as f: f.write('{"a": 1}') yield filepath filepath.unlink() @pytest.fixture(scope="module") def csv_filepath(): filepath = Path("./tmp.csv") with open(filepath, "w") as f: f.write("a,b\n1,2") yield filepath filepath.unlink() @pytest.fixture(scope="module") def tsv_filepath(): filepath = Path("./tmp.tsv") with open(filepath, "w") as f: f.write("a\tb\n1\t2") yield filepath filepath.unlink() @pytest.fixture(scope="module") def parquet_filepath(): filepath = Path("./tmp.parquet") df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) df.to_parquet(filepath) yield filepath filepath.unlink() @pytest.fixture(scope="module") def yaml_filepath(): filepath = Path("./tmp.yaml") with open(filepath, "w") as f: f.write("a: 1\nb: 2") yield filepath filepath.unlink() @pytest.fixture(scope="module") def image_filepath(): filepath = Path("./tmp.png") with open(filepath, "w") as f: f.write("mock image") yield filepath filepath.unlink() @pytest.fixture(scope="module") def svg_filepath(): filepath = Path("./tmp.svg") with open(filepath, "w") as f: f.write("") yield filepath filepath.unlink() @pytest.fixture(scope="module") def rds_filepath(): filepath = Path("./tmp.rds") with open(filepath, "w") as f: f.write("mock rds") yield filepath filepath.unlink() @pytest.fixture(scope="module") def local_anndata_filepath(): return ln.examples.datasets.anndata_file_pbmc68k_test().resolve() @pytest.fixture(scope="module") def adata(local_anndata_filepath): return ad.read_h5ad(local_anndata_filepath) def test_load_anndata(local_anndata_filepath, adata): artifact = ln.Artifact(local_anndata_filepath, description="test") assert local_anndata_filepath == artifact._local_filepath assert local_anndata_filepath == artifact.path assert local_anndata_filepath == artifact.cache() artifact = ln.Artifact.from_anndata(adata, description="test") assert artifact._memory_rep is adata assert artifact.load() is adata assert artifact._local_filepath.resolve() == artifact.cache() == artifact.path def test_load_mudata(get_small_mdata): artifact = ln.Artifact.from_mudata(get_small_mdata, description="test") assert artifact._memory_rep is get_small_mdata assert artifact.load() is get_small_mdata assert artifact._local_filepath.resolve() == artifact.cache() == artifact.path def test_load_spatialdata(get_small_sdata): artifact = ln.Artifact.from_spatialdata(get_small_sdata, description="test") assert artifact._memory_rep is get_small_sdata assert artifact.load() is get_small_sdata assert artifact._local_filepath.resolve() == artifact.cache() == artifact.path def load_blobs__repr__(): example_blobs_sdata = ln.examples.datasets.spatialdata_blobs() blobs_af = ln.Artifact.from_spatialdata( example_blobs_sdata, key="example_blobs.zarr" ).save() example_blobs_sdata = blobs_af.load() # Must exist and not throw errors assert example_blobs_sdata.__repr__ def test_load_html(html_filepath): artifact = ln.Artifact(html_filepath, key=str(html_filepath)) artifact.load() def test_load_json(json_filepath): artifact = ln.Artifact(json_filepath, key=str(json_filepath)) dictionary = artifact.load() assert dictionary["a"] == 1 def test_no_loader(zip_file): artifact = ln.Artifact(zip_file, key=str(zip_file)) with pytest.raises(NotImplementedError): artifact.load() def test_load_csv(csv_filepath): artifact = ln.Artifact(csv_filepath, key=str(csv_filepath)) df = artifact.load() assert df.iloc[0, 0] == 1 assert df.iloc[0, 1] == 2 def 
test_load_tsv(tsv_filepath): artifact = ln.Artifact(tsv_filepath, key=str(tsv_filepath)) df = artifact.load() assert df.iloc[0, 0] == 1 assert df.iloc[0, 1] == 2 def test_load_parquet(parquet_filepath): artifact = ln.Artifact(parquet_filepath, key=str(parquet_filepath)) df = artifact.load() assert df.iloc[0, 0] == 1 assert df.iloc[1, 1] == 4 def test_load_yaml(yaml_filepath): artifact = ln.Artifact(yaml_filepath, key=str(yaml_filepath)) data = artifact.load() assert data["a"] == 1 assert data["b"] == 2 def test_load_image(image_filepath): artifact = ln.Artifact(image_filepath, key=str(image_filepath)) result = artifact.load() assert Path(result).name == image_filepath.name def test_load_svg(svg_filepath): artifact = ln.Artifact(svg_filepath, key=str(svg_filepath)) result = artifact.load() assert Path(result).name == svg_filepath.name def test_load_rds(rds_filepath, ccaplog): artifact = ln.Artifact(rds_filepath, key=str(rds_filepath)) result = artifact.load() assert "Please use `laminr` to load `.rds` files" in ccaplog.text assert Path(result).name == rds_filepath.name ================================================ FILE: tests/core/test_manager.py ================================================ import lamindb as ln def test_manager_list(): label = ln.Record(name="manager label") label.save() label_names = [f"Record {i}" for i in range(3)] labels = [ln.Record(name=name) for name in label_names] ln.save(labels) label.parents.set(labels) assert len(label.parents.to_list()) == 3 assert "Record 1" in label.parents.to_list("name") label.delete(permanent=True) for label in labels: label.delete(permanent=True) ================================================ FILE: tests/core/test_merge.py ================================================ """Tests for ln.setup.merge.""" import lamindb as ln import pytest def test_merge_branch_into_main(): """Merge a branch into main: create branch, add ULabel, switch to main, merge.""" branch = ln.Branch(name="test_merge_branch").save() assert branch.status == "standalone" ln.setup.switch(branch.name) assert ln.setup.settings.branch == branch assert ln.setup.settings.branch.name == "test_merge_branch" ulabel = ln.ULabel(name="test_merge_record").save() assert ulabel.branch == branch assert ulabel.created_on == branch # created_on set to creation branch ln.setup.switch("main") assert ln.setup.settings.branch.name == "main" assert ln.setup.settings.branch.status == "standalone" assert ln.ULabel.filter(name="test_merge_record").count() == 0 ln.setup.merge("test_merge_branch") assert ln.ULabel.filter(name="test_merge_record").count() == 1 ulabel = ln.ULabel.get(name="test_merge_record") assert ulabel.branch.name == "main" # created_on still points to the branch on which the record was created assert ulabel.created_on == branch assert ulabel.created_on.name == "test_merge_branch" # merged branch has status "merged" branch.refresh_from_db() assert branch.status == "merged" # this is a merge call to check that branch.describe() works because it # has a custom describe method branch.describe(return_str=True) # Clean up ulabel.delete(permanent=True) branch.delete(permanent=True) ln.setup.switch("main") def test_branch_status_values(): """Branch status maps codes onto standalone/draft/review/merged/closed.""" main_branch = ln.Branch.get(name="main") assert main_branch.status == "standalone" archive_branch = ln.Branch.get(name="archive") assert archive_branch.status == "standalone" trash_branch = ln.Branch.get(name="trash") assert trash_branch.status == "standalone" # 
User-created branch is standalone by default. branch = ln.Branch(name="test_status_branch").save() assert branch.status == "standalone" branch.status = "draft" branch.save() branch.refresh_from_db() assert branch.status == "draft" branch.status = "review" branch.save() branch.refresh_from_db() assert branch.status == "review" branch.status = "closed" branch.save() branch.refresh_from_db() assert branch.status == "closed" branch.delete(permanent=True) def test_draft_review_and_close_merge_request_status(): branch = ln.Branch(name="test_mr_draft_review_close").save() assert branch.status == "standalone" branch.status = "draft" branch.save() branch.refresh_from_db() assert branch.status == "draft" branch.status = "review" branch.save() branch.refresh_from_db() assert branch.status == "review" branch.status = "closed" branch.save() branch.refresh_from_db() assert branch.status == "closed" branch.delete(permanent=True) def test_merge_nonexistent_branch_raises(): """Merge a non-existent branch raises ObjectDoesNotExist.""" with pytest.raises(ln.errors.ObjectDoesNotExist) as exc_info: ln.setup.merge("nonexistent_branch_xyz") assert "not found" in str(exc_info.value).lower() def test_merge_reconciles_is_latest_for_versioned_records(): main_branch = ln.Branch.get(name="main") ln.setup.switch(main_branch.name) transform_v1 = ln.Transform( key="test-merge-is-latest", source_code="main-v1", kind="pipeline", ).save() branch = ln.Branch(name="test_merge_latest_branch").save() ln.setup.switch(branch.name) transform_v2 = ln.Transform( key="test-merge-is-latest", revises=transform_v1, source_code="feature-v2", kind="pipeline", ).save() transform_v1.refresh_from_db() assert transform_v1.is_latest assert transform_v2.is_latest ln.setup.switch(main_branch.name) ln.setup.merge(branch.name) family = ln.Transform.objects.filter( uid__startswith=transform_v1.uid[:-4], branch_id=1 ) assert family.filter(is_latest=True).count() == 1 assert family.get(is_latest=True).uid == transform_v2.uid for record in family: record.delete(permanent=True) branch.delete(permanent=True) def test_merge_updates_recordblock_branch(): main_branch = ln.Branch.get(name="main") ln.setup.switch(main_branch.name) source_branch = ln.Branch(name="test_merge_recordblock_branch").save() ln.setup.switch(source_branch.name) record = ln.Record(name="recordblock-merge-record").save() block = ln.models.RecordBlock( record=record, content="recordblock merge content", kind="readme", branch=source_branch, created_on=source_branch, ).save() assert block.branch == source_branch assert block.created_on == source_branch ln.setup.switch(main_branch.name) ln.setup.merge(source_branch.name) block.refresh_from_db() assert block.branch.name == "main" assert block.created_on == source_branch record.delete(permanent=True) source_branch.delete(permanent=True) ================================================ FILE: tests/core/test_nbconvert.py ================================================ import os def test_nbconvert(): exit_code = os.system( # noqa: S605 "jupyter nbconvert --to notebook --inplace --execute ./tests/core/notebooks/load_schema.ipynb" ) assert exit_code == 0 ================================================ FILE: tests/core/test_notebooks.py ================================================ import os import subprocess from pathlib import Path import lamindb as ln import nbproject_test notebook_dir = Path(__file__).parent / "notebooks/" notebook_dir_duplicate = Path(__file__).parent / "notebooks/duplicate/" def test_all_notebooks(): 
nbproject_test.execute_notebooks(notebook_dir) nbproject_test.execute_notebooks(notebook_dir_duplicate) def test_run_after_rename_no_uid(): notebook_path = ( notebook_dir / "with-title-initialized-consecutive-finish-not-last-cell.ipynb" ) result = subprocess.run( # noqa: S602 f"jupyter nbconvert --to notebook --inplace --execute {notebook_path}", shell=True, capture_output=True, ) print(result.stdout.decode()) print(result.stderr.decode()) assert result.returncode == 0 uid = ln.Transform.get( key="with-title-initialized-consecutive-finish-not-last-cell.ipynb" ).uid # now, assume the user renames the notebook new_path = notebook_path.with_name("no-uid-renamed.ipynb") os.system(f"cp {notebook_path} {new_path}") # noqa: S605 result = subprocess.run( # noqa: S602 f"jupyter nbconvert --to notebook --inplace --execute {new_path}", shell=True, capture_output=True, ) print(result.stdout.decode()) print(result.stderr.decode()) assert result.returncode == 0 assert ln.Transform.get(key="no-uid-renamed.ipynb").uid == uid # new_path.unlink() ================================================ FILE: tests/core/test_querydb.py ================================================ import lamindb as ln import pytest def test_DB_multiple_instances(): """Accessing multiple instances simultaneously must work.""" cxg_db = ln.DB("laminlabs/cellxgene") lamindata_db = ln.DB("laminlabs/lamindata") qs1 = cxg_db.Artifact.filter(suffix=".h5ad") qs2 = lamindata_db.Artifact.filter(suffix=".zarr") assert qs1._db != qs2._db def test_DB_bionty(): """Querying a record from bionty must work.""" cxg_db = ln.DB("laminlabs/cellxgene") assert len(cxg_db.bionty.Gene.filter(symbol__startswith="TP53")) > 0 def test_DB_missing_module(): """Attempting to access an attribute that comes from a missing module must error.""" site_assets_db = ln.DB("laminlabs/lamin-site-assets") # instance without bionty with pytest.raises(AttributeError) as e: site_assets_db.bionty.Gene.first() assert ( "Schema 'bionty' not available in instance 'laminlabs/lamin-site-assets'." in str(e.value) ) def test_DB_instantiate_class(): """Attempting to instantiate a class must error.""" cxg_db = ln.DB("laminlabs/cellxgene") with pytest.raises(TypeError) as e: cxg_db.Artifact() assert ( "Cannot instantiate Artifact from DB. Use Artifact.filter(), Artifact.get(), etc. to query records." 
in str(e.value) ) @pytest.mark.parametrize( "attr,expected_msg", [ ("artifacts", "Registry 'artifacts' not found"), ("foo", "Registry 'foo' not found"), ("celltype", "Registry 'celltype' not found"), ], ) def test_DB_rejects_invalid_attributes(attr, expected_msg): """Accessing invalid attributes must fail.""" cxg_db = ln.DB("laminlabs/cellxgene") with pytest.raises(AttributeError) as e: getattr(cxg_db, attr) assert expected_msg in str(e.value) def test_DB_cache(): """Subsequent accesses must return cached wrapper.""" cxg_db = ln.DB("laminlabs/cellxgene") artifact1 = cxg_db.Artifact artifact2 = cxg_db.Artifact assert artifact1 is artifact2 def test_queryset_caching(): """Calling `.filter()` multiple times should return different results.""" cxg_db = ln.DB("laminlabs/cellxgene") res_1 = cxg_db.Artifact.filter().first() res_2 = cxg_db.Artifact.filter().last() assert res_1 != res_2 def test_DB_dir(): """__dir__ must return discovered registries.""" cxg = ln.DB("laminlabs/cellxgene") dir_result = dir(cxg) assert "Artifact" in dir_result assert "Collection" in dir_result assert "Gene" not in dir_result assert "bionty" in dir_result ================================================ FILE: tests/core/test_queryset.py ================================================ import re import textwrap from contextlib import contextmanager import bionty as bt import lamindb as ln import pytest from django.core.exceptions import FieldError from lamindb.base.users import current_user_id from lamindb.errors import InvalidArgument from lamindb.models import ArtifactSet, BasicQuerySet, QuerySet # please also see the test_curate_df.py tests def test_to_dataframe(): project_label = ln.Record(name="project").save() project_names = [f"Project {i}" for i in range(3)] labels = ln.Record.from_values(project_names, create=True).save() project_label.children.add(*labels) df = ln.Record.to_dataframe(include="parents__name") assert df.columns[2] == "parents__name" assert df["parents__name"].iloc[0] == {project_label.name} df = ln.Record.to_dataframe(include=["parents__name", "parents__created_by_id"]) assert df.columns[3] == "parents__created_by_id" assert df["parents__name"].iloc[0] == {project_label.name} assert set(df["parents__created_by_id"].iloc[0]) == {current_user_id()} # for other models feature_names = [f"Feature {i}" for i in range(3)] features = [ln.Feature(name=name, dtype=int) for name in feature_names] ln.save(features) schema = ln.Schema(features, name="my schema").save() schema.features.set(features) df = ln.Schema.filter(name="my schema").to_dataframe(include="features__name") assert df.columns[2] == "features__name" # order is not conserved assert set(df["features__name"].iloc[0]) == set(feature_names) # pass a list df = ln.Schema.filter(name="my schema").to_dataframe( include=["features__name", "features__created_by_id"] ) assert df.columns[3] == "features__created_by_id" assert set(df["features__name"].iloc[0]) == set(feature_names) assert set(df["features__created_by_id"].iloc[0]) == {current_user_id()} # inner join parents on features df = ln.Schema.filter().to_dataframe( include=["features__name", "features__created_by_id"] ) assert set(df["features__name"].iloc[0]) == set(feature_names) assert set(df["features__created_by_id"].iloc[0]) == {current_user_id()} # raise error for non many-to-many df = ln.Record.filter(name="Project 0").to_dataframe(include="created_by__name") assert df["created_by__name"].iloc[0] == ln.setup.settings.user.name # do not return fields with no data in the registry # does not 
make sense in Alex's opinion # too much magic; got removed in https://github.com/laminlabs/lamindb/pull/2238 # df = ( # ln.Artifact.connect("laminlabs/cellxgene") # .filter(suffix=".h5ad") # .to_dataframe(include=["tissues__name", "pathways__name"]) # ) # assert "tissues__name" in df.columns # assert "pathways__name" not in df.columns # assert df.shape[0] > 0 # clean up project_label.delete(permanent=True) for label in labels: label.delete(permanent=True) schema.delete(permanent=True) for feature in features: feature.delete(permanent=True) # call it from a non-select-derived queryset qs = ln.User.objects.all() assert qs.to_dataframe().iloc[0]["handle"] == ln.setup.settings.user.handle def test_complex_df_with_features(): # should not fail ln.Artifact.connect("laminlabs/lamindata").to_dataframe(include="features") ln.Run.connect("laminlabs/lamindata").to_dataframe(include="features") ln.Artifact.connect("laminlabs/lamindata").to_dataframe(features="queryset") def test_run_to_dataframe_includes_json_features(): transform = ln.Transform(key="test_run_to_dataframe_includes_json_features").save() run = ln.Run(transform=transform).save() feature = ln.Feature(name="run_json_feature", dtype=str).save() run.features.set_values({"run_json_feature": "hello"}) df = ln.Run.filter(id=run.id).to_dataframe(include="features") assert "run_json_feature" in df.columns assert df["run_json_feature"].iloc[0] == "hello" run.delete(permanent=True) transform.delete(permanent=True) feature.delete(permanent=True) def test_one_first(): qs = ln.User.objects.all() assert qs.one().handle == ln.setup.settings.user.handle assert qs.first().handle == ln.setup.settings.user.handle assert qs.one_or_none().handle == ln.setup.settings.user.handle description = textwrap.dedent("""\ User Simple fields """).strip() assert qs.describe(return_str=True).startswith(description) qs = ln.User.filter(handle="test") with pytest.raises(ln.errors.ObjectDoesNotExist): qs.one() qs = bt.Source.filter() with pytest.raises(ln.errors.MultipleObjectsReturned): qs.one() with pytest.raises(ln.errors.MultipleObjectsReturned): qs.one_or_none() def test_filter_related_field_name(): with pytest.raises( FieldError, match=re.escape( "Invalid lookup 'somelabel' for records. Did you mean records__name?" ), ): ln.Artifact.filter(records="somelabel") def test_filter_unknown_field(): with pytest.raises(InvalidArgument) as e: ln.Artifact.filter(nonexistent="value") assert "You can query either by available fields" in str(e) def test_filter_status_field(): transform = ln.Transform(key="test_filter_status_field").save() run = ln.Run(transform).save() run._status_code = 0 run.save(update_fields=["_status_code"]) assert ln.Run.filter(status="completed").count() >= 1 branch = ln.Branch(name="test_filter_status_branch").save() branch.status = "review" branch.save() assert ln.Branch.filter(status="review").count() >= 1 project = ln.Project(name="test_filter_status_project").save() project._status_code = 2 project.save(update_fields=["_status_code"]) assert ln.Project.filter(status=2).count() >= 1 run.delete(permanent=True) transform.delete(permanent=True) project.delete(permanent=True) branch.delete() def test_get_id_type_error(): with pytest.raises( ValueError, match=re.escape("Field 'id' expected a number but got 'abc'.") ): ln.Artifact.get(id="abc") def test_get_related_field_name(): with pytest.raises( FieldError, match=re.escape( "Invalid lookup 'somelabel' for records. Did you mean records__name?" 
), ): ln.Artifact.get(records="somelabel") def test_get_unknown_field(): with pytest.raises(FieldError) as e: ln.Artifact.get(nonexistent="value") assert "Unknown field 'nonexistent'. Available fields:" in str(e) def test_search(): label_names = [f"Record {i}" for i in range(3)] labels = [ln.Record(name=name) for name in label_names] ln.save(labels) qs = ln.Record.filter(name__startswith="Record") assert qs.search("Record 1")[0].name == "Record 1" assert qs.search("Record 1", field=ln.Record.name)[0].name == "Record 1" for label in labels: label.delete(permanent=True) def test_lookup(): qs = ln.User.filter(handle="testuser1") # pass str to field lookup = qs.lookup(field="handle") assert lookup.testuser1.handle == "testuser1" # pass StrField to field lookup = qs.lookup(field=ln.User.handle) assert lookup.testuser1.handle == "testuser1" # manager, default field qsm = ln.User.filter(handle="testuser1") lookup = qsm.lookup() assert lookup.testuser1.handle == "testuser1" def test_inspect(): qs = ln.User.filter(handle="testuser1") assert qs.inspect(["user1", "user2"], "name")["validated"] == [] assert ln.User.inspect(["user1", "user2"], "name")["validated"] == [] assert ln.User.inspect(["user1", "user2"], ln.User.name)["validated"] == [] assert ln.User.inspect("user1", "name")["validated"] == [] def test_validate(): qs = ln.User.filter(handle="testuser1") assert qs.validate(["testuser1", "Test User1"], "handle").tolist() == [True, False] assert ln.User.validate(["testuser1", "Test User1"], "handle").tolist() == [ True, False, ] assert ln.User.validate(["testuser1", "Test User1"], ln.User.handle).tolist() == [ True, False, ] # returns True assert ln.User.validate("testuser1", ln.User.handle) def test_standardize(): qs = ln.User.filter(handle="testuser1") assert qs.standardize(["user1", "user2"]) == ["user1", "user2"] def test_get_doesnotexist_error(): non_existent_label = "some-label-name" with pytest.raises(ln.errors.ObjectDoesNotExist) as excinfo: ln.Record.get(non_existent_label) error_message = str(excinfo.value) assert f"No record found with uid '{non_existent_label}'" in error_message assert ( f"Did you forget a keyword as in Record.get(name='{non_existent_label}')?" 
in error_message ) @contextmanager def set_branch(branch: ln.Branch): try: ln.setup.settings.branch = branch yield branch finally: ln.setup.settings._branch = None ln.setup.settings._branch_path.unlink(missing_ok=True) def test_get_filter_branch(): branch = ln.Branch(name="test_branch").save() artifact = ln.Artifact.from_dataframe( ln.User.to_dataframe(), key="df_test_get.parquet" ) artifact.branch = branch artifact.save() # switch to branch "test_branch" with set_branch(branch): # errors if doesn't find or multiple records found ln.Artifact.get(key="df_test_get.parquet") assert ln.Artifact.filter(key="df_test_get.parquet").count() == 1 # back to main branch with pytest.raises(ln.errors.ObjectDoesNotExist): ln.Artifact.get(key="df_test_get.parquet") assert ln.Artifact.filter(key="df_test_get.parquet").count() == 0 # test by passing branch directly assert ( ln.Artifact.filter( branch=branch, key="df_test_get.parquet", ).count() == 1 ) assert ( ln.Artifact.filter(branch_id=branch.id, key="df_test_get.parquet").count() == 1 ) assert ( ln.Artifact.filter(ln.Q(branch=branch), key="df_test_get.parquet").count() == 1 ) assert ( ln.Artifact.filter(ln.Q(branch_id=branch.id), key="df_test_get.parquet").count() == 1 ) # errors if doesn't find or multiple records found ln.Artifact.get(key="df_test_get.parquet", branch=branch) ln.Artifact.get(key="df_test_get.parquet", branch_id=branch.id) ln.Artifact.get(key="df_test_get.parquet", branch__in=[branch]) ln.Artifact.get(key="df_test_get.parquet", branch_id__in=[branch.id]) ln.Artifact.get(key="df_test_get.parquet", branch=None) ln.Artifact.get(key="df_test_get.parquet", branch_id=None) ln.Artifact.get(artifact.id) ln.Artifact.get(id=artifact.id) ln.Artifact.get(id__in=[artifact.id]) ln.Artifact.get(artifact.uid[:5]) ln.Artifact.get(uid=artifact.uid) ln.Artifact.get(uid__in=[artifact.uid]) ln.Artifact.get(hash=artifact.hash) ln.Artifact.get(hash__in=[artifact.hash]) artifact.delete(permanent=True) branch.delete() def test_to_class(): qs = ln.Artifact.filter() assert isinstance(qs, QuerySet) assert isinstance(qs, ArtifactSet) qs_copy = qs._to_non_basic(copy=True) assert isinstance(qs_copy, QuerySet) assert isinstance(qs_copy, ArtifactSet) qs_basic = qs._to_basic(copy=True) assert isinstance(qs_basic, BasicQuerySet) assert isinstance(qs_basic, ArtifactSet) assert not isinstance(qs_basic, QuerySet) qs_basic._to_non_basic(copy=False) assert isinstance(qs_basic, QuerySet) assert isinstance(qs_basic, ArtifactSet) def test_queryset_soft_delete_error(): with pytest.raises(ValueError): ln.Storage.filter().delete(permanent=False) with pytest.raises(ValueError): ln.Branch.filter().delete(permanent=False) def test_encode_lamindb_fields_as_columns(): from lamindb.models.query_set import encode_lamindb_fields_as_columns assert encode_lamindb_fields_as_columns( ln.Artifact, ["uid", "name", "created_by", "key", "tissues"] ) == { "uid": "__lamindb_artifact_uid__", "created_by": "__lamindb_artifact_created_by__", "key": "__lamindb_artifact_key__", } assert encode_lamindb_fields_as_columns( ln.Record, ["uid", "name", "created_by", "key", "tissues"] ) == { "uid": "__lamindb_record_uid__", "name": "__lamindb_record_name__", "created_by": "__lamindb_record_created_by__", } # def test_connect_public_clone_instance(): # # become an anonymous user # ln_setup.logout() # try: # from django.db import connections # connections.databases.pop("laminlabs/arc-virtual-cell-atlas", None) # qs = ln.Artifact.connect("laminlabs/arc-virtual-cell-atlas") # assert qs.db == 
"laminlabs/arc-virtual-cell-atlas" # # Verify the connection is SQLite, not Postgres # assert ( # "sqlite" # in connections.databases["laminlabs/arc-virtual-cell-atlas"]["ENGINE"] # ) # # Verify we can actually query it # result = qs.filter().first() # assert result is not None # finally: # # log back in to ensure that other tests do not break # login_testuser2(session=None) # login_testuser1(session=None) # ln_setup.connect("lamindb-unit-tests-core") ================================================ FILE: tests/core/test_record_basics.py ================================================ import os import re from datetime import date, datetime import bionty as bt import lamindb as ln import pandas as pd import pytest from django.db import IntegrityError from lamindb.errors import FieldValidationError from lamindb.models.record import IMPORTS_UID, SCHEMA_IMPORTS_UID def test_record_docstring_examples(): # create a feature if you don't yet have one gc_content = ln.Feature(name="gc_content", dtype=float).save() # create a record to track a sample sample1 = ln.Record(name="Sample 1", features={"gc_content": 0.5}).save() # describe the record sample1.describe() # create a flexible record type to track experiments experiment_type = ln.Record(name="Experiment", is_type=True).save() experiment1 = ln.Record(name="Experiment 1", type=experiment_type).save() # create a feature to link experiments experiment = ln.Feature(name="experiment", dtype=experiment_type).save() # create a record type to track samples that's constrained with a schema schema = ln.Schema( [experiment, gc_content.with_config(optional=True)], name="sample_schema" ).save() sample_sheet = ln.Record(name="Sample Sheet", is_type=True, schema=schema).save() # group the sample1 record under the sample sheet sample1.type = sample_sheet sample1.save() # reset the feature values for the record including the experiment sample1.features.set_values( { "gc_content": 0.5, "experiment": "Experiment 1", # automatically resolves by name, also accepts the experiment1 object } ) # Export all records under a type to a dataframe df = experiment_type.to_dataframe() assert "Experiment 1" in df["__lamindb_record_name__"].values # If you try to set incomplete features in a record in a sheet, you'll get a validation error sample2 = ln.Record(name="Sample 2", type=sample_sheet).save() with pytest.raises(ln.errors.ValidationError): sample2.features.set_values({"gc_content": 0.6}) # Query records by features assert ln.Record.filter(gc_content=0.5).one() == sample1 assert ln.Record.filter(gc_content__gt=0.4).one() == sample1 assert ln.Record.filter(type=sample_sheet).count() >= 1 # Clean up sample1.delete(permanent=True) sample2.delete(permanent=True) experiment1.delete(permanent=True) sample_sheet.delete(permanent=True) schema.delete(permanent=True) experiment_type.delete(permanent=True) gc_content.delete(permanent=True) experiment.delete(permanent=True) def test_record_initialization(): with pytest.raises( FieldValidationError, match=re.escape( "Only name, type, is_type, features, description, schema, reference, reference_type are valid keyword arguments" ), ): ln.Record(x=1) with pytest.raises(ValueError) as error: ln.Record(1) assert error.exconly() == "ValueError: Only one non-keyword arg allowed" def test_record_lazy_features_on_save(): score_feature = ln.Feature(name="lazy_score", dtype=float).save() record = ln.Record(name="lazy-record", features={"lazy_score": 0.7}).save() assert not hasattr(record, "_features") assert 
ln.Record.filter(lazy_score=0.7).one().id == record.id record.delete(permanent=True) score_feature.delete(permanent=True) def test_record_from_dataframe_bulk_save_paths(): score = ln.Feature(name="from-df-score", dtype=float).save() schema = ln.Schema([score], name="from-df-schema").save() sheet = ln.Record(name="from-df-sheet", is_type=True, schema=schema).save() df = pd.DataFrame( { "__lamindb_record_name__": ["from-df-a", "from-df-b"], "from-df-score": [1.0, 2.0], } ) records = ln.Record.from_dataframe(df, type=sheet) assert len(records) == 2 records.save() assert ln.Record.get(name="from-df-a").features.get_values()["from-df-score"] == 1.0 df2 = pd.DataFrame( { "__lamindb_record_name__": ["from-df-c"], "from-df-score": [3.0], } ) records_2 = ln.Record.from_dataframe(df2, type=sheet) records_2.save() assert ln.Record.get(name="from-df-c").features.get_values()["from-df-score"] == 3.0 ln.Record.filter(name__in=["from-df-a", "from-df-b", "from-df-c"]).delete( permanent=True ) ln.Record.filter(name="from-df-sheet").delete(permanent=True) schema.delete(permanent=True) score.delete(permanent=True) def test_record_from_dataframe_requires_named_type(): df = pd.DataFrame({"__lamindb_record_name__": ["x"], "score": [1.0]}) non_type_record = ln.Record(name="from-df-non-type").save() unnamed_type = ln.Record(name="from-df-temp-type", is_type=True) unnamed_type.name = None with pytest.raises(ValueError, match="is_type=True"): ln.Record.from_dataframe(df, type=non_type_record) with pytest.raises(ValueError, match="non-null `name`"): ln.Record.from_dataframe(df, type=unnamed_type) non_type_record.delete(permanent=True) def test_record_from_dataframe_with_string_type_creates_import_type(): score = ln.Feature(name="from-df-str-score", dtype=float).save() df = pd.DataFrame( { "__lamindb_record_name__": ["from-df-str-a", "from-df-str-b"], "from-df-str-score": [11.0, 12.0], } ) imports_type = ln.Record.filter(uid=IMPORTS_UID).one_or_none() original_imports_name = None if imports_type is not None: original_imports_name = imports_type.name imports_type.name = "from-df-renamed-imports-parent" imports_type.save() try: records = ln.Record.from_dataframe(df, type="from-df-str-type") created_type = ln.Record.get(name="from-df-str-type", is_type=True) imports_type = ln.Record.get(uid=IMPORTS_UID) assert len(records) == 2 assert records.type.id == created_type.id assert created_type.type_id == imports_type.id assert created_type.schema.type is not None assert created_type.schema.type.uid == SCHEMA_IMPORTS_UID assert created_type.schema_id is not None records.save() assert ( ln.Record.get(name="from-df-str-a").features.get_values()[ "from-df-str-score" ] == 11.0 ) finally: created_type = ln.Record.filter( name="from-df-str-type", is_type=True ).one_or_none() ln.Record.filter(name__in=["from-df-str-a", "from-df-str-b"]).delete( permanent=True ) ln.Record.filter(name="from-df-str-type").delete(permanent=True) if created_type is not None and created_type.schema_id is not None: ln.Schema.filter(id=created_type.schema_id).delete(permanent=True) if original_imports_name is not None: imports_type = ln.Record.get(uid=IMPORTS_UID) imports_type.name = original_imports_name imports_type.save() score.delete(permanent=True) def test_record_from_dataframe_with_string_type_duplicate_name_errors(): score = ln.Feature(name="from-df-dup-score", dtype=float).save() schema = ln.Schema([score], name="from-df-dup-schema").save() imports_type = ln.Record.filter(uid=IMPORTS_UID).one_or_none() if imports_type is None: imports_type = 
ln.Record(name="Imports", is_type=True) imports_type.uid = IMPORTS_UID imports_type = imports_type.save() ln.Record( name="from-df-dup-type", is_type=True, schema=schema, type=imports_type ).save() df = pd.DataFrame( { "__lamindb_record_name__": ["from-df-dup-a"], "from-df-dup-score": [21.0], } ) with pytest.raises(ValueError, match="already exists"): ln.Record.from_dataframe(df, type="from-df-dup-type") ln.Record.filter(name="from-df-dup-type").delete(permanent=True) schema.delete(permanent=True) score.delete(permanent=True) def test_feature_manager_raise_not_validated_values(): from lamindb.models._feature_manager import FeatureManager assert FeatureManager._raise_not_validated_values({}) is None with pytest.raises(ln.errors.ValidationError) as error: FeatureManager._raise_not_validated_values( { "Record": ("name", ["missing-record"]), "bionty.Gene": ("symbol", ["missing-gene"]), } ) message = str(error.value) assert "These values could not be validated" in message assert ( "records = ln.Record.from_values(['missing-record'], field='name', create=True).save()" in message ) assert ( "records = bionty.Gene.from_values(['missing-gene'], field='symbol').save()" in message ) def test_name_lookup(): my_type = ln.Record(name="MyType", is_type=True).save() label1 = ln.Record(name="label 1", type=my_type).save() label2 = ln.Record(name="label 1", type=my_type) assert label2 == label1 label2 = ln.Record(name="label 1") assert label2 != label1 label2.save() label3 = ln.Record(name="label 1") assert label3 == label2 label2.delete(permanent=True) label1.delete(permanent=True) my_type.delete(permanent=True) @pytest.mark.skipif( os.getenv("LAMINDB_TEST_DB_VENDOR") == "sqlite", reason="Postgres-only" ) def test_invalid_type_record_with_schema(): schema = ln.Schema(name="test_schema", itype=ln.Feature).save() record_type_with_schema = ln.Record( name="TypeWithSchema", is_type=True, schema=schema ).save() with pytest.raises(IntegrityError) as error: ln.Record(name="InvalidType", is_type=True, type=record_type_with_schema).save() assert "record_type_is_valid_fk" in error.exconly() record_type_with_schema.delete(permanent=True) schema.delete(permanent=True) # see test_artifact_features_add_remove_query in test_artifact_external_features_annotations.py for similar test for Artifacts (populate and query by features) def test_record_features_add_remove_values(): record_type1 = ln.Record(name="RecordType1", is_type=True).save() record_entity1 = ln.Record(name="entity1", type=record_type1).save() record_entity2 = ln.Record(name="entity2", type=record_type1).save() ulabel = ln.ULabel(name="test-ulabel").save() artifact = ln.Artifact(".gitignore", key="test-artifact").save() collection = ln.Collection(artifact, key="test-collection").save() transform = ln.Transform(key="test-transform").save() run = ln.Run(transform, name="test-run").save() feature_bool = ln.Feature(name="feature_bool", dtype=bool).save() feature_str = ln.Feature(name="feature_str", dtype=str).save() feature_list_str = ln.Feature(name="feature_list_str", dtype=list[str]).save() feature_int = ln.Feature(name="feature_int", dtype=int).save() feature_list_int = ln.Feature(name="feature_list_int", dtype=list[int]).save() feature_float = ln.Feature(name="feature_float", dtype=float).save() feature_list_float = ln.Feature(name="feature_list_float", dtype=list[float]).save() feature_num = ln.Feature(name="feature_num", dtype="num").save() feature_url = ln.Feature(name="feature_url", dtype="url").save() feature_list_num = 
ln.Feature(name="feature_list_num", dtype="list[num]").save() feature_datetime = ln.Feature(name="feature_datetime", dtype=datetime).save() feature_date = ln.Feature(name="feature_date", dtype=datetime.date).save() feature_dict = ln.Feature(name="feature_dict", dtype=dict).save() feature_type1 = ln.Feature(name="feature_type1", dtype=record_type1).save() feature_type1s = ln.Feature(name="feature_type1s", dtype=list[record_type1]).save() feature_user = ln.Feature(name="feature_user", dtype=ln.User).save() feature_ulabel = ln.Feature(name="feature_ulabel", dtype=ln.ULabel).save() feature_project = ln.Feature(name="feature_project", dtype=ln.Project).save() feature_artifact = ln.Feature(name="feature_artifact", dtype=ln.Artifact).save() feature_collection = ln.Feature( name="feature_collection", dtype=ln.Collection ).save() feature_run = ln.Feature(name="feature_run", dtype=ln.Run.uid).save() feature_cell_line = ln.Feature(name="feature_cell_line", dtype=bt.CellLine).save() feature_cell_lines = ln.Feature( name="feature_cell_lines", dtype=list[bt.CellLine] ).save() feature_cl_ontology_id = ln.Feature( name="feature_cl_ontology_id", dtype=bt.CellLine.ontology_id ).save() feature_gene = ln.Feature(name="feature_gene", dtype=bt.Gene).save() test_record = ln.Record(name="test_record").save() test_project = ln.Project(name="test_project").save() hek293 = bt.CellLine.from_source(name="HEK293").save() a549 = bt.CellLine.from_source(name="A-549").save() tmem276 = bt.Gene.from_source(symbol="Tmem276", organism="mouse").save() # test feature.dtype_as_object assert feature_bool.dtype_as_object is bool assert feature_str.dtype_as_object is str assert feature_list_str.dtype_as_object == list[str] assert feature_int.dtype_as_object is int assert feature_list_int.dtype_as_object == list[int] assert feature_float.dtype_as_object is float assert feature_list_float.dtype_as_object == list[float] assert feature_num.dtype_as_object is float assert feature_url.dtype_as_object is str assert feature_list_num.dtype_as_object == list[float] assert feature_datetime.dtype_as_object == datetime assert feature_date.dtype_as_object == date assert feature_dict.dtype_as_object is dict assert feature_type1.dtype_as_object == record_type1 assert feature_type1s.dtype_as_object == list[record_type1] assert feature_user.dtype_as_object == ln.User.handle assert feature_ulabel.dtype_as_object == ln.ULabel.name assert feature_project.dtype_as_object == ln.Project.name assert feature_artifact.dtype_as_object == ln.Artifact.key assert feature_collection.dtype_as_object == ln.Collection.key assert feature_run.dtype_as_object == ln.Run.uid assert feature_cell_line.dtype_as_object == bt.CellLine.name assert feature_cell_lines.dtype_as_object == list[bt.CellLine.name] assert feature_cl_ontology_id.dtype_as_object == bt.CellLine.ontology_id assert feature_gene.dtype_as_object == bt.Gene.symbol # no schema validation test_values = { "feature_bool": True, "feature_str": "00810702-0006", # this string value could be cast to datetime! don't change! 
"feature_list_str": ["a", "list", "of", "strings"], "feature_int": 42, "feature_list_int": [1, 2, 3], "feature_num": 3.14, "feature_url": "https://lamin.ai/docs", "feature_list_num": [2.71, 3.14, 1.61], "feature_float": 3.14, "feature_list_float": [2.71, 3.14, 1.61], "feature_datetime": datetime(2024, 1, 1, 12, 0, 0), "feature_date": date(2024, 1, 1), "feature_dict": {"key": "value", "number": 123, "list": [1, 2, 3]}, "feature_type1": "entity1", "feature_type1s": ["entity1", "entity2"], "feature_ulabel": "test-ulabel", "feature_user": ln.setup.settings.user.handle, "feature_project": "test_project", "feature_cell_line": "HEK293", "feature_cell_lines": ["HEK293", "A-549"], "feature_gene": "Tmem276", "feature_cl_ontology_id": "CVCL_0045", "feature_artifact": "test-artifact", "feature_collection": "test-collection", "feature_run": run.uid, } test_record.features.add_values(test_values) assert test_record.features.get_values() == test_values # --- Query by features (same data as above) --- # Equality assert ln.Record.filter(feature_str=test_values["feature_str"]).one() == test_record assert ln.Record.filter(feature_int=42).one() == test_record assert ln.Record.filter(feature_type1="entity1").one() == test_record assert ln.Record.filter(feature_cell_line="HEK293").one() == test_record assert ln.Record.filter(feature_url="https://lamin.ai/docs").one() == test_record assert ( ln.Record.filter(feature_str=test_values["feature_str"], feature_int=42).one() == test_record ) # Datetime and date (filter uses ISO strings as stored in JSON) assert ln.Record.filter(feature_datetime="2024-01-01T12:00:00").one() == test_record assert ln.Record.filter(feature_date="2024-01-01").one() == test_record # __contains (categorical) assert ln.Record.filter(feature_cell_line__contains="HEK").one() == test_record assert ln.Record.filter(feature_type1__contains="entity").one() == test_record # Invalid field with pytest.raises(ln.errors.InvalidArgument) as error: ln.Record.filter(feature_str_typo="x", feature_int=42).one() assert error.exconly().startswith( "lamindb.errors.InvalidArgument: You can query either by available fields:" ) # DoesNotExist (no Record named "nonexistent_entity" exists) with pytest.raises(ln.errors.ObjectDoesNotExist) as error: ln.Record.filter(feature_type1="nonexistent_entity").one() assert "Did not find" in error.exconly() # Combined filter (3 keys) assert ( ln.Record.filter( feature_str=test_values["feature_str"], feature_int=42, feature_type1="entity1", ).one() == test_record ) # Bionty: filter by record assert ln.Record.filter(feature_cell_line=hek293).one() == test_record # Bionty: filter by ontology_id string assert ln.Record.filter(feature_cl_ontology_id="CVCL_0045").one() == test_record # Bionty __contains (ontology_id) assert ( ln.Record.filter(feature_cl_ontology_id__contains="0045").one() == test_record ) # DoesNotExist (Record not found: feature_project) with pytest.raises(ln.errors.ObjectDoesNotExist) as error: ln.Record.filter(feature_project="nonexistent_project").one() assert "Did not find" in error.exconly() # __contains returns multiple (add second record, assert, then remove) value_record = ln.Record(name="query_test_value_record").save() value_record.features.add_values({"feature_type1": "entity2"}) assert len(ln.Record.filter(feature_type1__contains="entity")) == 2 value_record.features.remove_values("feature_type1") value_record.delete(permanent=True) # Numeric comparators __lt, __gt (int, float, num) assert ln.Record.filter(feature_int__lt=21).one_or_none() is None assert 
    len(ln.Record.filter(feature_int__gt=21)) >= 1
    # int __lt/__gt that would fail with string comparison (42 vs 5, 42 vs 100)
    assert ln.Record.filter(feature_int__lt=5).one_or_none() is None
    assert ln.Record.filter(feature_int__gt=100).one_or_none() is None
    # float/num __lt/__gt (numeric comparison on SQLite via json_extract + CAST)
    assert ln.Record.filter(feature_float__lt=5.0).one() == test_record
    assert ln.Record.filter(feature_float__gt=1.0).one() == test_record
    assert ln.Record.filter(feature_float__gt=10.0).one_or_none() is None
    assert ln.Record.filter(feature_num__lt=5.0).one() == test_record
    assert ln.Record.filter(feature_num__gt=1.0).one() == test_record
    assert ln.Record.filter(feature_num__gt=10.0).one_or_none() is None
    # Date and datetime comparators (ISO strings)
    assert ln.Record.filter(feature_date__lt="2024-01-02").one() == test_record
    assert ln.Record.filter(feature_date__gt="2023-12-31").one() == test_record
    assert ln.Record.filter(feature_date__gt="2024-01-02").one_or_none() is None
    assert (
        ln.Record.filter(feature_datetime__lt="2024-01-01T13:00:00").one()
        == test_record
    )
    assert (
        ln.Record.filter(feature_datetime__gt="2024-01-01T11:00:00").one()
        == test_record
    )
    assert (
        ln.Record.filter(feature_datetime__lt="2024-01-01T11:00:00").one_or_none()
        is None
    )
    # ManyToMany accessors
    assert set(test_record.linked_records.to_list()) == {record_entity1, record_entity2}
    assert test_record.linked_in_records.count() == 0
    assert set(record_entity1.linked_in_records.to_list()) == {test_record}
    assert set(record_entity2.linked_in_records.to_list()) == {test_record}
    assert record_entity1.linked_records.count() == 0
    assert record_entity2.linked_records.count() == 0
    # all empty sheet
    schema = ln.Schema(
        [
            feature_bool,
            feature_str,
            feature_int,
            feature_list_str,
            feature_list_int,
            feature_num,
            feature_url,
            feature_float,
            feature_list_float,
            feature_list_num,
            feature_datetime,
            feature_date,
            feature_dict,
            feature_type1,
            feature_type1s,
            feature_ulabel,
            feature_user,
            feature_project,
            feature_cell_line,
            feature_cell_lines,
            feature_cl_ontology_id,
            feature_gene,
            feature_artifact,
            feature_collection,
            feature_run,
        ],
        name="test_schema",
    ).save()
    sheet = ln.Record(name="Sheet", is_type=True, schema=schema).save()
    empty_record = ln.Record(name="empty_record", type=sheet).save()
    df_empty = sheet.to_dataframe()
    assert df_empty["feature_bool"].isnull().all()
    assert df_empty["feature_bool"].dtype.name == "boolean"
    assert df_empty["feature_str"].isnull().all()
    assert df_empty["feature_str"].dtype.name == "string"
    assert df_empty["feature_int"].isnull().all()
    assert df_empty["feature_int"].dtype.name == "Int64"
    assert df_empty["feature_float"].isnull().all()
    assert df_empty["feature_float"].dtype.name == "float64"
    assert df_empty["feature_num"].isnull().all()
    assert df_empty["feature_num"].dtype.name == "float64"
    assert df_empty["feature_url"].isnull().all()
    assert df_empty["feature_url"].dtype.name == "string"
    assert df_empty["feature_list_str"].isnull().all()
    assert df_empty["feature_list_str"].dtype.name == "object"
    assert df_empty["feature_list_int"].isnull().all()
    assert df_empty["feature_list_int"].dtype.name == "object"
    assert df_empty["feature_datetime"].isnull().all()
    assert df_empty["feature_datetime"].dtype.name == "datetime64[ns]"
    assert df_empty["feature_date"].isnull().all()
    assert df_empty["feature_date"].dtype.name == "object"
    assert df_empty["feature_dict"].isnull().all()
    assert df_empty["feature_dict"].dtype.name == "object"
    assert df_empty["feature_type1"].isnull().all()
    assert
df_empty["feature_type1"].dtype.name == "category" assert df_empty["feature_type1s"].isnull().all() assert df_empty["feature_type1s"].dtype.name == "object" assert df_empty["feature_ulabel"].isnull().all() assert df_empty["feature_ulabel"].dtype.name == "category" assert df_empty["feature_user"].isnull().all() assert df_empty["feature_user"].dtype.name == "category" assert df_empty["feature_project"].isnull().all() assert df_empty["feature_project"].dtype.name == "category" assert df_empty["feature_cell_line"].isnull().all() assert df_empty["feature_cell_line"].dtype.name == "category" assert df_empty["feature_cell_lines"].isnull().all() assert df_empty["feature_cell_lines"].dtype.name == "object" assert df_empty["feature_cl_ontology_id"].isnull().all() assert df_empty["feature_cl_ontology_id"].dtype.name == "category" assert df_empty["feature_artifact"].isnull().all() assert df_empty["feature_artifact"].dtype.name == "category" assert df_empty["feature_collection"].isnull().all() assert df_empty["feature_collection"].dtype.name == "category" assert df_empty["feature_run"].isnull().all() assert df_empty["feature_run"].dtype.name == "category" # remove empty record from sheet empty_record.type = None empty_record.save() # sheet with values test_record.type = sheet test_record.save() df = sheet.to_dataframe() target_result = { "feature_bool": True, "feature_str": "00810702-0006", # this string value could be cast to datetime! "feature_list_str": ["a", "list", "of", "strings"], "feature_int": 42, "feature_list_int": [1, 2, 3], "feature_float": 3.14, "feature_list_float": [2.71, 3.14, 1.61], "feature_num": 3.14, "feature_url": "https://lamin.ai/docs", "feature_list_num": [2.71, 3.14, 1.61], "feature_datetime": pd.Timestamp("2024-01-01 12:00:00"), "feature_date": date(2024, 1, 1), "feature_dict": {"key": "value", "list": [1, 2, 3], "number": 123}, "feature_type1": "entity1", "feature_ulabel": "test-ulabel", "feature_user": ln.setup.settings.user.handle, "feature_project": "test_project", "feature_cell_line": "HEK293", "feature_cl_ontology_id": "CVCL_0045", "feature_gene": "Tmem276", "feature_artifact": "test-artifact", "feature_collection": "test-collection", "feature_run": run.uid, "__lamindb_record_uid__": test_record.uid, "__lamindb_record_name__": "test_record", } result = df.to_dict(orient="records")[0] # need to handle categorical lists differently because # we don't yet respect ordering result_feature_type1s = result.pop("feature_type1s") assert set(result_feature_type1s) == {"entity1", "entity2"} assert isinstance(result_feature_type1s, list) result_feature_cell_lines = result.pop("feature_cell_lines") assert set(result_feature_cell_lines) == {"HEK293", "A-549"} assert isinstance(result_feature_cell_lines, list) assert result == target_result # export to artifact to trigger validation -- this will raise many errors if anything is inconsistent sheet_as_artifact = sheet.to_artifact() # could devise a test for get_values or features.describe() # but this is extensively tested elsewhere # print(sheet_as_artifact.features.get_values()) # assert sheet_as_artifact.features.get_values() sheet_as_artifact.delete(permanent=True) # add the empty record back to the sheet and export again empty_record.type = sheet empty_record.save() df = sheet.to_dataframe() sheet_as_artifact = sheet.to_artifact() sheet_as_artifact.delete(permanent=True) # test passing ISO-format date string for date test_record2 = ln.Record(name="test_record").save() # we could also test different ways of formatting but don't yet 
do that # in to_dataframe() we enforce ISO format already feature_date = ln.Feature.get(name="feature_date") feature_date.coerce = True # have to allow coercion because we're passing a string feature_date.save() test_values["feature_date"] = "2024-01-02" test_record2.features.add_values(test_values) test_record2.type = sheet test_record2.save() test_values["feature_date"] = date(2024, 1, 2) assert test_record2.features.get_values() == test_values assert test_record.features.get_values() != test_values # also test export to artifact again sheet_as_artifact = sheet.to_artifact() sheet_as_artifact.delete(permanent=True) test_record2.delete(permanent=True) empty_record.delete(permanent=True) # test move a value into the trash record_entity1.delete() test_values.pop("feature_type1") test_values["feature_type1s"] = ["entity2"] test_values["feature_date"] = date(2024, 1, 1) assert test_record.features.get_values() == test_values df = sheet.to_dataframe() result = df.to_dict(orient="records")[0] result_feature_type1s = result.pop("feature_type1s") assert set(result_feature_type1s) == {"entity2"} assert isinstance(result_feature_type1s, list) result_feature_cell_lines = result.pop("feature_cell_lines") assert set(result_feature_cell_lines) == {"HEK293", "A-549"} assert isinstance(result_feature_cell_lines, list) target_result.pop("feature_type1") assert pd.isna(result.pop("feature_type1")) assert result == target_result record_entity1.restore() test_values["feature_type1"] = "entity1" test_values["feature_type1s"] = ["entity1", "entity2"] # remove values test_record.features.remove_values("feature_int") test_values.pop("feature_int") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_date") test_values.pop("feature_date") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_type1") test_values.pop("feature_type1") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_type1s") test_values.pop("feature_type1s") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_ulabel") test_values.pop("feature_ulabel") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_cell_line") test_values.pop("feature_cell_line") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_user") test_values.pop("feature_user") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_artifact") test_values.pop("feature_artifact") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_collection") test_values.pop("feature_collection") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_run") test_values.pop("feature_run") assert test_record.features.get_values() == test_values # test passing None has no effect, does not lead to annotation sheet.schema = None sheet.save() schema.delete(permanent=True) test_record.features.add_values({"feature_int": None, "feature_type1": None}) assert test_record.features.get_values() == test_values # schema validation feature_str = ln.Feature.get(name="feature_str") feature_int = ln.Feature.get(name="feature_int") schema = ln.Schema([feature_str, feature_int], name="test_schema").save() test_form = ln.Record(name="TestForm", is_type=True, schema=schema).save() test_record_in_form = 
ln.Record(name="test_record_in_form", type=test_form).save() with pytest.raises(ln.errors.ValidationError) as error: test_record_in_form.features.add_values({"feature_type1": "entity1"}) assert "COLUMN_NOT_IN_DATAFRAME" in error.exconly() test_record_in_form.delete(permanent=True) test_form.delete(permanent=True) schema.delete(permanent=True) # test with list of strings schema = ln.Schema([feature_cell_lines], name="test_schema2").save() test_form = ln.Record(name="TestForm", is_type=True, schema=schema).save() test_record_in_form = ln.Record(name="test_record_in_form", type=test_form).save() test_record_in_form.features.add_values({"feature_cell_lines": ["HEK293", "A-549"]}) test_record_in_form.delete(permanent=True) test_form.delete(permanent=True) schema.delete(permanent=True) # test with list of records (rather than passing strings) schema = ln.Schema([feature_cell_lines], name="test_schema2").save() test_form = ln.Record(name="TestForm", is_type=True, schema=schema).save() test_record_in_form = ln.Record(name="test_record_in_form", type=test_form).save() test_record_in_form.features.add_values({"feature_cell_lines": [a549, hek293]}) test_record_in_form.delete(permanent=True) test_form.delete(permanent=True) schema.delete(permanent=True) # clean up rest test_record_id = test_record.id assert ln.models.RecordJson.filter(record_id=test_record_id).count() > 0 test_record.delete(permanent=True) # test CASCADE deletion of RecordJson assert ln.models.RecordJson.filter(record_id=test_record_id).count() == 0 sheet.delete(permanent=True) feature_str.delete(permanent=True) feature_list_str.delete(permanent=True) feature_int.delete(permanent=True) feature_list_int.delete(permanent=True) feature_datetime.delete(permanent=True) feature_date.delete(permanent=True) feature_type1.delete(permanent=True) feature_type1s.delete(permanent=True) feature_ulabel.delete(permanent=True) feature_user.delete(permanent=True) feature_project.delete(permanent=True) feature_dict.delete(permanent=True) feature_artifact.delete(permanent=True) feature_run.delete(permanent=True) feature_cell_lines.delete(permanent=True) record_entity1.delete(permanent=True) record_entity2.delete(permanent=True) record_type1.delete(permanent=True) test_project.delete(permanent=True) feature_cell_line.delete(permanent=True) feature_cl_ontology_id.delete(permanent=True) feature_collection.delete(permanent=True) feature_gene.delete(permanent=True) hek293.delete(permanent=True) a549.delete(permanent=True) tmem276.delete(permanent=True) ulabel.delete(permanent=True) collection.delete(permanent=True) artifact.delete(permanent=True) run.delete(permanent=True) transform.delete(permanent=True) feature_num.delete(permanent=True) feature_url.delete(permanent=True) def test_date_and_datetime_corruption(): feature_datetime = ln.Feature( name="feature_datetime", dtype=datetime, coerce=True ).save() feature_date = ln.Feature( name="feature_date", dtype=datetime.date, coerce=True ).save() schema = ln.Schema( [feature_datetime, feature_date], name="test_schema_date_datetime" ).save() test_sheet = ln.Record(name="TestSheet", is_type=True).save() record = ln.Record(name="test_record", type=test_sheet).save() # pass values with Z suffix test_values = { "feature_datetime": "2024-01-01T12:00:00Z", "feature_date": "2025-01-17", } record.features.add_values(test_values) date_value = ln.models.RecordJson.get(record=record, feature=feature_date) # manually corrupt the value date_value.value = "2025-01-17T00:00:00.000Z" date_value.save() assert 
    record.features.get_values() == {
        "feature_datetime": pd.Timestamp("2024-01-01 12:00:00", tz="UTC"),
        "feature_date": date(2025, 1, 17),
    }
    record.schema = schema
    record.save()
    df = test_sheet.to_dataframe()
    result = df.to_dict(orient="records")[0]
    # because in a dataframe we'll hit pandera and pandera expects naive
    # timestamps, to_dataframe() converts to naive by removing timezone info
    assert result["feature_datetime"] == pd.Timestamp("2024-01-01 12:00:00")
    assert result["feature_date"] == date(2025, 1, 17)
    record.delete(permanent=True)
    test_sheet.delete(permanent=True)
    schema.delete(permanent=True)
    feature_datetime.delete(permanent=True)
    feature_date.delete(permanent=True)


def test_only_list_type_features_and_field_qualifiers():
    # this test is necessary because the logic for adding link tables
    # to the query previously only fired when a non-list cat feature of the same type was present
    feature_cell_lines = ln.Feature(
        name="feature_cell_lines", dtype=list[bt.CellLine]
    ).save()
    feature_list_ontology_id = ln.Feature(
        name="feature_list_ontology_id", dtype=list[bt.Tissue.ontology_id]
    ).save()
    schema = ln.Schema(
        [feature_cell_lines, feature_list_ontology_id], name="test_schema2"
    ).save()
    # create a feature with the same name to test robustness w.r.t. this
    feature_type = ln.Feature(name="FeatureTypeX", is_type=True).save()
    feature_cell_lines_duplicate = ln.Feature(
        name="feature_cell_lines", dtype=bt.CellLine, type=feature_type
    ).save()
    test_sheet = ln.Record(name="TestSheet", is_type=True, schema=schema).save()
    record = ln.Record(name="test_record", type=test_sheet).save()
    hek293 = bt.CellLine.from_source(name="HEK293").save()
    a549 = bt.CellLine.from_source(name="A-549").save()
    uberon2369 = bt.Tissue.from_source(ontology_id="UBERON:0002369").save()
    uberon5172 = bt.Tissue.from_source(ontology_id="UBERON:0005172").save()
    test_values = {
        "feature_cell_lines": ["HEK293", "A-549"],
        "feature_list_ontology_id": ["UBERON:0002369", "UBERON:0005172"],
    }
    record.features.add_values(test_values)
    assert record.features.get_values() == test_values
    df = test_sheet.to_dataframe()
    result = df.to_dict(orient="records")[0]
    assert isinstance(result["feature_cell_lines"], list)
    assert isinstance(result["feature_list_ontology_id"], list)
    assert set(result["feature_cell_lines"]) == {"HEK293", "A-549"}
    assert set(result["feature_list_ontology_id"]) == {
        "UBERON:0002369",
        "UBERON:0005172",
    }
    # add another record
    record2 = ln.Record(name="test_record2", type=test_sheet).save()
    test_values2 = {
        "feature_cell_lines": ["HEK293"],
        "feature_list_ontology_id": ["UBERON:0005172"],
    }
    record2.features.add_values(test_values2)
    # trigger validation of the case that has two and a single record
    # this tests type casting in list-like values
    artifact = test_sheet.to_artifact()
    assert (
        len(artifact.schemas.first().members) == 2
    )  # this requires top most match filtering during validation
    record.delete(permanent=True)
    record2.delete(permanent=True)
    test_sheet.delete(permanent=True)
    inferred_schema = artifact.schemas.first()
    artifact.delete(permanent=True)
    inferred_schema.delete(permanent=True)
    schema.delete(permanent=True)
    feature_cell_lines.delete(permanent=True)
    feature_cell_lines_duplicate.delete(permanent=True)
    feature_type.delete(permanent=True)
    hek293.delete(permanent=True)
    a549.delete(permanent=True)
    uberon2369.delete(permanent=True)
    uberon5172.delete(permanent=True)


def test_record_feature_predicate_query():
    age = ln.Feature(name="pred_record_age", dtype=int).save()
    record_type = ln.Record(name="PredRecordType", is_type=True).save()
    record_a = ln.Record(name="pred_record_a", type=record_type).save()
    record_b = ln.Record(name="pred_record_b", type=record_type).save()
    record_a.features.add_values({"pred_record_age": 42})
    record_b.features.add_values({"pred_record_age": 10})
    assert ln.Record.filter(age > 40).one() == record_a
    assert ln.Record.filter(age <= 10).one() == record_b
    neq_results = ln.Record.filter(age != 42)
    assert record_b in neq_results
    assert record_a not in neq_results
    record_a.delete(permanent=True)
    record_b.delete(permanent=True)
    record_type.delete(permanent=True)
    age.delete(permanent=True)


def test_record_features_accept_feature_object_keys():
    feature_score = ln.Feature(name="record_feature_object_score", dtype=int).save()
    feature_tag = ln.Feature(name="record_feature_object_tag", dtype=str).save()
    record = ln.Record(name="record_feature_object_test").save()
    record.features.add_values({feature_score: 7, "record_feature_object_tag": "a"})
    assert record.features.get_values() == {
        "record_feature_object_score": 7,
        "record_feature_object_tag": "a",
    }
    # set_values should also accept Feature objects as dictionary keys.
    record.features.set_values({feature_tag: "b"})
    assert record.features.get_values() == {"record_feature_object_tag": "b"}
    record.features.add_values({feature_score: 9})
    assert record.features.get_values() == {
        "record_feature_object_score": 9,
        "record_feature_object_tag": "b",
    }
    # remove_values supports dictionary inputs with Feature keys.
    record.features.remove_values({feature_score: 9, feature_tag: None})
    assert record.features.get_values() == {}
    record.delete(permanent=True)
    feature_score.delete(permanent=True)
    feature_tag.delete(permanent=True)


================================================
FILE: tests/core/test_record_sheet_examples.py
================================================
import lamindb as ln
import pandas as pd

from lamindb.examples.fixtures.sheets import (
    populate_nextflow_sheet_with_samples,  # noqa: F401
    populate_sheets_compound_treatment,  # noqa: F401
)


def test_float_int_casting():
    # this test is only needed for as long as we let JS write data into RecordJson
    # for JS a 3 is a valid float even though any python json parser interprets it as an int
    feature_int = ln.Feature(name="feature_int", dtype=int).save()
    feature_float = ln.Feature(name="feature_float", dtype=float).save()
    test_schema = ln.Schema([feature_int, feature_float], name="test_schema").save()
    sheet = ln.Record(name="TestSheet", is_type=True, schema=test_schema).save()
    record = ln.Record(name="test_record", type=sheet).save()
    record.features.add_values({"feature_int": 5, "feature_float": 3.0})
    record_json = ln.models.RecordJson.get(record=record, feature=feature_float)
    record_json.value = 3
    record_json.save()
    df = sheet.to_dataframe()
    assert df["feature_int"].dtype.name == "int64"
    assert df["feature_float"].dtype.name == "float64"
    # this export call would error if we didn't have type casting
    artifact = sheet.to_artifact()
    related_schemas = list(artifact.schemas.all())
    artifact.schemas.clear()
    artifact.delete(permanent=True)
    record.delete(permanent=True)
    sheet.delete(permanent=True)
    for schema in related_schemas:
        schema.delete(permanent=True)
    # schema.delete(permanent=True), not necessary because already deleted above
    feature_float.delete(permanent=True)
    feature_int.delete(permanent=True)


def test_record_example_compound_treatment(
    populate_sheets_compound_treatment: tuple[ln.Record, ln.Record],  # noqa: F811
):
    treatments_sheet, sample_sheet1 = populate_sheets_compound_treatment
    dictionary
= ( ln.Record.filter(type=treatments_sheet) .to_dataframe()[["is_type", "name"]] .to_dict(orient="list") ) assert dictionary == { "is_type": [ False, False, ], "name": [ "treatment2", "treatment1", ], } dictionary = ( ln.Record.filter(type=treatments_sheet) .to_dataframe(features=True)[["compound", "concentration", "name"]] .to_dict(orient="list") ) assert dictionary == { "compound": [ "drug2", "drug1", ], "concentration": [ "4nM", "2nM", ], "name": [ "treatment2", "treatment1", ], } dictionary = ( ln.Record.filter(type=sample_sheet1) .to_dataframe(features=["cell_line", "treatment"])[ ["cell_line", "__lamindb_record_name__", "treatment"] ] .to_dict(orient="list") ) assert dictionary == { "cell_line": [ "HEK293T", "HEK293T", ], "__lamindb_record_name__": [ "sample2", "sample1", ], "treatment": [ "treatment2", "treatment1", ], } assert sample_sheet1.input_of_runs.count() == 0 df = sample_sheet1.to_dataframe() assert sample_sheet1.input_of_runs.count() == 1 assert df.index.name == "__lamindb_record_id__" dictionary = df[ [ "id", # a feature "uid", # a feature "name", # a feature "cell_line", "treatment", "preparation_date", "__lamindb_record_name__", ] ].to_dict(orient="list") assert dictionary == { "id": [1, 2], "uid": ["S1", "S2"], "name": ["Sample 1", "Sample 2"], "cell_line": [ "HEK293T", "HEK293T", ], "preparation_date": [ pd.to_datetime("2025-06-01T05:00:00"), pd.to_datetime("2025-06-01T06:00:00"), ], "treatment": [ "treatment1", "treatment2", ], "__lamindb_record_name__": [ "sample1", "sample2", ], } artifact = sample_sheet1.to_artifact() assert sample_sheet1.schema.members.to_list("name") == [ "id", "uid", "name", "treatment", "cell_line", "preparation_date", "project", ] assert artifact.run.input_records.count() == 1 assert artifact.transform.kind == "function" assert artifact.transform.key == "__lamindb_record_export__" # looks something like this: # id,uid,name,treatment,cell_line,preparation_date,__lamindb_record_uid__,__lamindb_record_name__ # 1,S1,Sample 1,treatment1,HEK293T,2025-06-01 05:00:00,iCwgKgZELoLtIoGy,sample1 # 2,S2,Sample 2,treatment2,HEK293T,2025-06-01 06:00:00,qvU9m7VF6fSdsqJs,sample2 assert len(artifact.load()) == 2 # two rows in the dataframe assert artifact.path.read_text().startswith("""\ id,uid,name,treatment,cell_line,preparation_date,project,__lamindb_record_uid__,__lamindb_record_name__ 1,S1,Sample 1,treatment1,HEK293T,2025-06-01 05:00:00,Project 1""") assert artifact.key == f"sheet_exports/{sample_sheet1.name}.csv" assert artifact.description.startswith(f"Export of sheet {sample_sheet1.uid}") assert artifact._state.adding is False assert ln.models.ArtifactRecord.filter(artifact=artifact).count() == 2 assert artifact.features.describe(return_str=True).endswith("""\ └── Dataset features └── columns (7) cell_line bionty.CellLine HEK293T id int name str preparation_date datetime project Project Project 1 treatment Record[Treatment] treatment1, treatment2 uid str""") # re-run the export which triggers hash lookup sample_sheet1.to_artifact() # soft-delete a record in the sheet sample_sheet1.records.first().delete() assert ln.Record.filter(type=sample_sheet1).count() == 1 df = sample_sheet1.to_dataframe() print(df) assert len(df) == 1 # one row in the dataframe artifact.delete(permanent=True) def test_nextflow_sheet_with_samples( populate_nextflow_sheet_with_samples: ln.Record, # noqa: F811 ): """Test the example fixture for nextflow sheet with samples.""" # This test is to ensure that the fixture works as expected # and that the data is correctly populated in 
the database. nextflow_sheet = populate_nextflow_sheet_with_samples df = nextflow_sheet.to_dataframe() assert df[ ["expected_cells", "fastq_1", "fastq_2", "sample", "__lamindb_record_name__"] ].to_dict(orient="list") == { "expected_cells": [ 5000, 5000, 5000, ], "fastq_1": [ "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R1_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R1_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R1_001.fastq.gz", ], "fastq_2": [ "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R2_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R2_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R2_001.fastq.gz", ], "__lamindb_record_name__": [ None, None, None, ], "sample": [ "Sample_X", "Sample_Y", "Sample_Y", ], } assert nextflow_sheet.schema is not None artifact = nextflow_sheet.to_artifact() assert artifact.schema is nextflow_sheet.schema assert artifact._state.adding is False assert set(nextflow_sheet.schema.members.to_list("name")) == { "sample", "fastq_1", "fastq_2", "expected_cells", "seq_center", } assert set(artifact.features.slots["columns"].members.to_list("name")) == { "sample", "fastq_1", "fastq_2", "expected_cells", "seq_center", } assert artifact.path.read_text().startswith("""\ sample,fastq_1,fastq_2,expected_cells,seq_center,__lamindb_record_uid__,__lamindb_record_name__ Sample_X,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R1_001.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R2_001.fastq.gz,5000,,""") assert artifact.features.describe(return_str=True).endswith("""\ └── Dataset features └── columns (5) expected_cells int fastq_1 str fastq_2 str sample Record[BioSample] Sample_X, Sample_Y seq_center str""") related_schemas = list(artifact.schemas.all()) artifact.schemas.clear() for schema in related_schemas: schema.delete(permanent=True) artifact.delete(permanent=True) def test_record_soft_deleted_recreate(): """Test that a soft-deleted record can be recreated with changes.""" # testing soft delete and recreate with sqlite (postgres is tested in core/test_delete.py) # soft delete a record, then recreate it with some changes record = ln.Record(name="test_record").save() uid = record.uid assert record.branch_id == 1 record.delete() assert record.branch_id == -1 # now recreate the same record with the same uid but a different name record = ln.Record(name="test_record 2") record.uid = uid record.save() # now this record is recovered from the trash assert record.branch_id == 1 assert record.name == "test_record 2" ln.Record.objects.filter().delete() def test_annotate_with_user_feature(): """Test that annotating with a user feature works as expected.""" user_feature = ln.Feature(name="created_by", dtype=ln.User).save() schema = ln.Schema( name="test_schema_user_feature", features=[user_feature], coerce=True, ).save() sheet = ln.Record(name="A sheet with users", is_type=True, schema=schema).save() record = ln.Record(name="first user", type=sheet).save() user = ln.User(uid="abcdefgh", handle="test-user").save() ln.models.RecordUser(record=record, feature=user_feature, 
value=user).save() df = sheet.to_dataframe() assert df.index.name == "__lamindb_record_id__" assert df.columns.to_list() == [ "created_by", "__lamindb_record_uid__", "__lamindb_record_name__", ] assert df.iloc[0]["created_by"] == "test-user" # clean up record.type = None record.save() record.delete(permanent=True) sheet.delete(permanent=True) schema.delete(permanent=True) user_feature.delete(permanent=True) user.delete(permanent=True) def test_to_artifact_exports_all_records(): # create sheet with >100 records, the default limit for to_dataframe sheet = ln.Record(name="LargeSheet", is_type=True).save() for i in range(101): ln.Record(name=f"record_{i}", type=sheet).save() df = sheet.to_dataframe() assert len(df) == 101, f"Expected 101 records, got {len(df)}" sheet.records.all().delete(permanent=True) sheet.delete(permanent=True) def test_to_artifact_with_required_non_nullable_data_id_maximal_set_true(): feature_data_id = ln.Feature(name="data_id", dtype=str, nullable=False).save() schema = ln.Schema( [feature_data_id], name="schema_with_required_data_id", maximal_set=True, ).save() sheet = ln.Record(name="SheetWithDataId", is_type=True, schema=schema).save() # Name is intentionally omitted to mirror sheet records in real-world pipelines. record = ln.Record(type=sheet).save() record.features.add_values({"data_id": "D1"}) artifact = sheet.to_artifact() df = artifact.load() assert "data_id" in df.columns assert df["data_id"].to_list() == ["D1"] assert "__lamindb_record_name__" in df.columns assert df["__lamindb_record_name__"].isna().all() # clean up record.delete(permanent=True) sheet.delete(permanent=True) artifact.delete(permanent=True) schema.delete(permanent=True) feature_data_id.delete(permanent=True) ================================================ FILE: tests/core/test_rename_features_labels.py ================================================ import datetime import os import lamindb as ln import pandas as pd import pytest def test_rename_feature(ccaplog): df = pd.DataFrame({"old_name": [1, 2]}) ln.Feature(name="old_name", dtype=int).save() artifact = ln.Artifact.from_dataframe( df, key="test.parquet", schema="valid_features" ).save() feature = ln.Feature.get(name="old_name") # First rename feature.name = "new_name" feature.save() now1 = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0) assert ( "by renaming feature from 'old_name' to 'new_name' 1 artifact no longer matches the feature name in storage:" in ccaplog.text ) if os.getenv("LAMINDB_TEST_DB_VENDOR") != "sqlite": feature.refresh_from_db() assert feature.synonyms == "old_name" assert feature._aux["renamed"] == { now1.isoformat().replace("+00:00", "Z"): "old_name" } # Second rename feature.name = "newer_name" feature.save() now2 = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0) assert ( "by renaming feature from 'new_name' to 'newer_name' 1 artifact no longer matches the feature name in storage:" in ccaplog.text ) if os.getenv("LAMINDB_TEST_DB_VENDOR") != "sqlite": feature.refresh_from_db() assert feature.synonyms == "old_name|new_name" assert feature._aux["renamed"] == { now1.isoformat().replace("+00:00", "Z"): "old_name", now2.isoformat().replace("+00:00", "Z"): "new_name", } schema = artifact.schemas.first() artifact.delete(permanent=True) schema.delete(permanent=True) feature.delete(permanent=True) @pytest.mark.parametrize("model_class", [ln.ULabel, ln.Record]) def test_rename_label(model_class, ccaplog): df = pd.DataFrame( { "feature1": pd.Categorical(["label1", "label2"]), "feature2": 
pd.Categorical(["label2", "label2"]), } ) label1 = model_class(name="label1").save() label2 = model_class(name="label2").save() feature1 = ln.Feature(name="feature1", dtype=model_class).save() feature2 = ln.Feature(name="feature2", dtype=model_class).save() artifact = ln.Artifact.from_dataframe( df, key="test.parquet", schema="valid_features" ).save() label = model_class.get(name="label1") label.name = "label-renamed" label.save() assert ( "by renaming label from 'label1' to 'label-renamed' 1 artifact no longer matches the label name in storage:" in ccaplog.text ) schema = artifact.schemas.first() artifact.delete(permanent=True) schema.delete(permanent=True) feature1.delete(permanent=True) feature2.delete(permanent=True) label1.delete(permanent=True) label2.delete(permanent=True) ================================================ FILE: tests/core/test_run.py ================================================ import time import lamindb as ln import pytest def test_run(): with pytest.raises(ValueError) as error: ln.Run(1, 2) assert error.exconly() == "ValueError: Only one non-keyword arg allowed: transform" with pytest.raises(TypeError) as error: ln.Run() assert error.exconly() == "TypeError: Pass transform parameter" transform = ln.Transform(key="my_transform") with pytest.raises(ValueError) as error: ln.Run(transform) assert ( error.exconly() == "ValueError: Please save transform record before creating a run" ) transform.save() run = ln.Run(transform).save() assert run.status == "scheduled" assert run.reference is None assert run.reference_type is None run2 = ln.Run(transform, reference="test1", reference_type="test2").save() assert run2.reference == "test1" assert run2.reference_type == "test2" assert run.uid != run2.uid run.delete(permanent=True) report_artifact = ln.Artifact( "README.md", kind="__lamindb_run__", description="report of run2" ).save() run2.report = report_artifact environment = ln.Artifact( "CONTRIBUTING.md", kind="__lamindb_run__", description="requirements.txt" ).save() run2.environment = environment run2.save() # report/env artifacts will be cleaned up in background subprocess run2.delete(permanent=True) assert ln.Run.filter(uid=run2.uid).count() == 0 # report/env are still present in the database assert ln.Artifact.filter(uid=report_artifact.uid).count() == 1 assert ln.Artifact.filter(uid=environment.uid).count() == 1 transform.delete(permanent=True) assert ln.Run.filter(uid=run.uid).count() == 0 # wait for background cleanup subprocess to delete artifacts time.sleep(4) assert ln.Artifact.filter(uid=report_artifact.uid).count() == 0 assert ln.Artifact.filter(uid=environment.uid).count() == 0 def test_bulk_permanent_run_delete(tmp_path): transform = ln.Transform(key="Bulk run delete transform").save() n_runs = 2 report_files = [tmp_path / f"report_{i}.txt" for i in range(n_runs)] for i, path in enumerate(report_files): path.write_text(f"content {i}") report_artifacts = [ ln.Artifact(path, kind="__lamindb_run__", description=f"report {i}").save() for i, path in enumerate(report_files) ] runs = [ln.Run(transform, report=af).save() for af in report_artifacts] run_ids = [r.id for r in runs] ln.settings.verbosity = "debug" ln.Run.filter(id__in=run_ids).order_by("created_at").delete(permanent=True) assert ln.Run.filter(id__in=run_ids).count() == 0 assert ln.Artifact.filter(uid=report_artifacts[0].uid).count() == 1 transform.delete(permanent=True) # wait for background cleanup subprocess to delete artifacts time.sleep(4) assert 
ln.Artifact.filter(uid=report_artifacts[0].uid).count() == 0 clean_up_logs = ln.setup.settings.cache_dir / f"run_cleanup_logs_{runs[0].uid}.txt" assert f"deleted artifact {report_artifacts[0].id}" in clean_up_logs.read_text() ================================================ FILE: tests/core/test_save.py ================================================ # ruff: noqa: F811 import lamindb as ln import pytest from _dataset_fixtures import ( # noqa get_mini_csv, ) from lamindb.models.save import prepare_error_message, store_artifacts def test_bulk_save_and_update(): label_names = [f"Record {i} new" for i in range(3)] labels = [ln.Record(name=name) for name in label_names] # test bulk creation of new records ln.save(labels) assert len(ln.Record.filter(name__in=label_names).distinct()) == 3 labels[0].name = "Record 0 updated" # test bulk update of existing records ln.save(labels) assert len(ln.Record.filter(name__in=label_names).distinct()) == 2 assert ln.Record.get(name="Record 0 updated") def test_prepare_error_message(get_mini_csv): artifact = ln.Artifact(get_mini_csv, description="test") exception = Exception("exception") error = prepare_error_message([], [artifact], exception) assert error.startswith( "The following entries have been successfully uploaded and committed to the database" ) error = prepare_error_message([artifact], [], exception) assert error.startswith("No entries were uploaded or committed to the database") def test_save_data_object(get_mini_csv): artifact = ln.Artifact(get_mini_csv, description="test") artifact.save() assert artifact.path.exists() artifact.delete(permanent=True, storage=True) def test_store_artifacts_acid(get_mini_csv): artifact = ln.Artifact(get_mini_csv, description="test") artifact._clear_storagekey = "test.csv" # errors on check_and_attempt_clearing with pytest.raises(FileNotFoundError): artifact.save() with pytest.raises(RuntimeError) as error: store_artifacts([artifact], using_key=None) assert str(error.exconly()).startswith( "RuntimeError: The following entries have been successfully uploaded" ) artifact.delete(permanent=True) def test_save_parents(): import bionty as bt bt.CellType.from_values(["B cell", "T cell"]).save() assert bt.CellType.get(name="B cell").parents.to_dataframe().shape[0] == 1 bt.CellType.filter().delete(permanent=True) def test_save_batch_size(): label_names = [f"Record {i} batch_size" for i in range(3)] labels = [ln.Record(name=name) for name in label_names] # test bulk creation of new records with batch size ln.save(labels, batch_size=2) assert ln.Record.filter(name__in=label_names).distinct().count() == 3 def test_bulk_save_lazy_record_features(): cell_type = ln.Record(name="lazy-cell-type", is_type=True).save() ln.Record(name="lazy-b-cell", type=cell_type).save() ln.Record(name="lazy-t-cell", type=cell_type).save() score_feature = ln.Feature(name="lazy-bulk-score", dtype=float).save() cell_feature = ln.Feature(name="lazy-bulk-cell", dtype=cell_type).save() schema = ln.Schema([score_feature, cell_feature], name="lazy-bulk-schema").save() sheet = ln.Record(name="lazy-sheet", is_type=True, schema=schema).save() records = [ ln.Record( name="lazy-sample-1", type=sheet, features={"lazy-bulk-score": 0.1, "lazy-bulk-cell": "lazy-b-cell"}, ), ln.Record( name="lazy-sample-2", type=sheet, features={"lazy-bulk-score": 0.2, "lazy-bulk-cell": "lazy-t-cell"}, ), ] ln.save(records) sample_1 = ln.Record.get(name="lazy-sample-1") sample_2 = ln.Record.get(name="lazy-sample-2") sample_1_values = sample_1.features.get_values() sample_2_values = 
sample_2.features.get_values() assert sample_1_values["lazy-bulk-score"] == 0.1 assert sample_2_values["lazy-bulk-score"] == 0.2 assert sample_1_values["lazy-bulk-cell"] == "lazy-b-cell" assert sample_2_values["lazy-bulk-cell"] == "lazy-t-cell" assert not hasattr(records[0], "_features") assert not hasattr(records[1], "_features") ln.Record.filter(name__in=["lazy-sample-1", "lazy-sample-2"]).delete(permanent=True) ln.Record.filter(name="lazy-sheet").delete(permanent=True) ln.Record.filter(name__in=["lazy-b-cell", "lazy-t-cell"]).delete(permanent=True) ln.Record.filter(name="lazy-cell-type").delete(permanent=True) schema.delete(permanent=True) score_feature.delete(permanent=True) cell_feature.delete(permanent=True) def test_bulk_save_lazy_record_features_requires_same_schema(): feature_a = ln.Feature(name="lazy-schema-a", dtype=float).save() feature_b = ln.Feature(name="lazy-schema-b", dtype=float).save() schema_a = ln.Schema([feature_a], name="lazy-schema-a").save() schema_b = ln.Schema([feature_b], name="lazy-schema-b").save() type_a = ln.Record(name="lazy-type-a", is_type=True, schema=schema_a).save() type_b = ln.Record(name="lazy-type-b", is_type=True, schema=schema_b).save() records = [ ln.Record(name="lazy-mixed-1", type=type_a, features={"lazy-schema-a": 1.0}), ln.Record(name="lazy-mixed-2", type=type_b, features={"lazy-schema-b": 2.0}), ] with pytest.raises( ln.errors.ValidationError, match="same type schema", ): ln.save(records) ln.Record.filter(name__in=["lazy-mixed-1", "lazy-mixed-2"]).delete(permanent=True) ln.Record.filter(name__in=["lazy-type-a", "lazy-type-b"]).delete(permanent=True) schema_a.delete(permanent=True) schema_b.delete(permanent=True) feature_a.delete(permanent=True) feature_b.delete(permanent=True) def test_bulk_save_lazy_record_features_requires_schema(): unschematized_type = ln.Record(name="lazy-no-schema-type", is_type=True).save() records = [ ln.Record( name="lazy-no-schema-1", type=unschematized_type, features={"foo": 1.0} ) ] with pytest.raises( ln.errors.ValidationError, match="same non-null type schema", ): ln.save(records) ln.Record.filter(name="lazy-no-schema-1").delete(permanent=True) ln.Record.filter(name="lazy-no-schema-type").delete(permanent=True) def test_bulk_resave_trashed_records(): import bionty as bt # first create records from public source records = bt.Ethnicity.from_values(["asian", "white"]).save() assert len(records) == 2 # parents are also created ethnicities = bt.Ethnicity.filter() assert ethnicities.count() > 2 # soft delete the records including parent ethnicities.delete() # then create them again from public source # the new records will now have the same uids as they are hashed from the ontology_ids assert bt.Ethnicity.filter().count() == 0 new_records = bt.Ethnicity.from_values(["asian", "white", "african"]) assert new_records[0].branch_id == 1 assert new_records[0].uid == records[0].uid # after saving, the trashed records should be restored new_records.save() assert new_records[0].branch_id == 1 ethnicities = bt.Ethnicity.filter() # the parent should also be restored assert ethnicities.count() > 3 # clean up ethnicities.delete(permanent=True) ================================================ FILE: tests/core/test_schema.py ================================================ import bionty as bt import lamindb as ln import pandas as pd import pytest from django.db.utils import IntegrityError from lamindb.errors import FieldValidationError, InvalidArgument, ValidationError from lamindb.models.schema import get_related_name, 
validate_features @pytest.fixture(scope="module") def df(): return pd.DataFrame( { "feat1": [1, 2, 3], "feat2": [3, 4, 5], "feat3": ["cond1", "cond2", "cond2"], "feat4": ["id1", "id2", "id3"], } ) def test_schema_from_values(): gene_symbols = ["TCF7", "MYC"] bt.Gene.filter(symbol__in=gene_symbols).delete(permanent=True) with pytest.raises(ValidationError) as error: schema = ln.Schema.from_values( gene_symbols, bt.Gene.symbol, dtype=int, organism="human" ) assert error.exconly().startswith( "lamindb.errors.ValidationError: These values could not be validated:" ) ln.save(bt.Gene.from_values(gene_symbols, "symbol", organism="human")) schema = ln.Schema.from_values(gene_symbols, bt.Gene.symbol, organism="human") # below should be a queryset and not a list assert set(schema.members) == set( bt.Gene.from_values(gene_symbols, "symbol", organism="human") ) assert schema.dtype == "num" # this is NUMBER_TYPE schema = ln.Schema.from_values( gene_symbols, bt.Gene.symbol, dtype=int, organism="human" ) assert schema._state.adding assert schema.dtype == "int" assert schema.itype == "bionty.Gene" schema.save() assert set(schema.members) == set(schema.genes.all()) id = schema.id # test that the schema is retrieved from the database # in case it already exists schema = ln.Schema.from_values( gene_symbols, bt.Gene.symbol, dtype=int, organism="human" ) assert not schema._state.adding assert id == schema.id schema.delete(permanent=True) # edge cases with pytest.raises(ValueError): schema = ln.Schema.from_values([]) with pytest.raises(TypeError): ln.Schema.from_values(["a"], field="name") with pytest.raises(ValidationError): schema = ln.Schema.from_values( ["weird_name"], field=ln.Feature.name, dtype="float" ) def test_schema_from_records(df): features = ln.Feature.from_dataframe(df) with pytest.raises(ValueError) as error: schema = ln.Schema(features) assert ( error.exconly() == "ValueError: Can only construct feature sets from validated features" ) ln.save(features) schema = ln.Schema(features) assert schema.id is None assert schema._state.adding assert schema.dtype is None assert schema.itype == "Feature" schema.save() # test that the schema is retrieved from the database # in case it already exists schema = ln.Schema(features) assert not schema._state.adding assert schema.id is not None schema.delete(permanent=True) # edge case with pytest.raises(ValueError): positional_arg = 1 ln.Schema(features, positional_arg) def test_schema_from_df(df): # test using type human = bt.Organism.from_source(name="human").save() genes = [bt.Gene(symbol=name, organism=human) for name in df.columns] ln.save(genes) with pytest.raises(ValueError) as error: ln.Schema.from_dataframe(df, field=bt.Gene.symbol) assert error.exconly().startswith("ValueError: data types are heterogeneous:") schema = ln.Schema.from_dataframe(df[["feat1", "feat2"]], field=bt.Gene.symbol) for gene in genes: gene.delete(permanent=True) # now for the features registry features = ln.Feature.from_dataframe(df) ln.save(features) schema = ln.Schema.from_dataframe(df).save() assert schema.dtype is None ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_get_related_name(): with pytest.raises(ValueError): get_related_name(ln.Transform) def test_validate_features(): with pytest.raises(ValueError): validate_features([]) with pytest.raises(TypeError): validate_features(["feature"]) with pytest.raises(TypeError): validate_features({"feature"}) transform = ln.Transform(key="test").save() # This is just a type check with 
pytest.raises(TypeError) as error: validate_features([transform, ln.Run(transform)]) assert error.exconly() == "TypeError: schema can only contain a single type" transform.delete(permanent=True) def test_kwargs(): with pytest.raises(FieldValidationError): ln.Schema(x="1", features=[]) def test_edge_cases(): feature = ln.Feature(name="rna", dtype="float") ln.save([feature]) with pytest.raises(ValueError) as error: ln.Schema(feature) assert ( error.exconly() == "ValueError: Please pass a ListLike of features, not a single feature" ) feature.delete(permanent=True) @pytest.fixture(scope="module") def mini_immuno_schema_flexible(): schema = ln.examples.datasets.mini_immuno.define_mini_immuno_schema_flexible() yield schema ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) bt.Gene.filter().delete(permanent=True) ln.Record.filter(type__isnull=False).delete(permanent=True) ln.Record.filter().delete(permanent=True) bt.CellType.filter().delete(permanent=True) def test_schema_update_implicit_through_name_equality( mini_immuno_schema_flexible: ln.Schema, ccaplog, ): df = pd.DataFrame({"a": [1]}) artifact = ln.Artifact.from_dataframe(df, key="test_artifact.parquet").save() artifact.schema = mini_immuno_schema_flexible artifact.save() orig_hash = mini_immuno_schema_flexible.hash warning_message = "you updated the schema hash and might invalidate datasets that were previously validated with this schema:" # different numbers of features ------------------------------------------- schema = ln.Schema( name="Mini immuno schema", features=[ ln.Feature.get(name="perturbation"), ln.Feature.get(name="donor"), ], ).save() assert schema.hash != orig_hash assert ccaplog.text.count(warning_message) == 1 # change is flexible (an auxiliary field) -------------------------------- schema = ln.Schema( name="Mini immuno schema", features=[ ln.Feature.get(name="perturbation"), ln.Feature.get(name="cell_type_by_model"), ln.Feature.get(name="assay_oid"), ln.Feature.get(name="donor"), ln.Feature.get(name="concentration"), ln.Feature.get(name="treatment_time_h"), ], flexible=True, ).save() assert schema.hash == orig_hash # restored original hash assert ccaplog.text.count(warning_message) == 2 # warning raised schema = ln.Schema( name="Mini immuno schema", features=[ ln.Feature.get(name="perturbation"), ln.Feature.get(name="cell_type_by_model"), ln.Feature.get(name="assay_oid"), ln.Feature.get(name="donor"), ln.Feature.get(name="concentration"), ln.Feature.get(name="treatment_time_h"), ], flexible=False, ).save() assert schema.hash != orig_hash assert ccaplog.text.count(warning_message) == 3 # warning raised ln.examples.datasets.mini_immuno.define_mini_immuno_schema_flexible() artifact.delete(permanent=True) # restore original hash -------------------------------- schema = ln.Schema( name="Mini immuno schema", features=[ ln.Feature.get(name="perturbation"), ln.Feature.get(name="cell_type_by_model"), ln.Feature.get(name="assay_oid"), ln.Feature.get(name="donor"), ln.Feature.get(name="concentration"), ln.Feature.get(name="treatment_time_h"), ], flexible=True, ).save() assert schema.hash == orig_hash # restored original hash def test_schema_update( mini_immuno_schema_flexible: ln.Schema, ccaplog, ): df = pd.DataFrame({"a": [1]}) artifact = ln.Artifact.from_dataframe(df, key="test_artifact.parquet").save() artifact.schema = mini_immuno_schema_flexible artifact.save() # store original hash orig_hash = mini_immuno_schema_flexible.hash warning_message = "you updated the schema hash and might 
invalidate datasets that were previously validated with this schema:" # add a feature ------------------------------------------- feature_to_add = ln.Feature(name="sample_note", dtype=str).save() assert mini_immuno_schema_flexible.n_members == 6 mini_immuno_schema_flexible.features.add(feature_to_add) mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash != orig_hash assert mini_immuno_schema_flexible.n_members == 7 assert ccaplog.text.count(warning_message) == 1 # remove the feature again mini_immuno_schema_flexible.features.remove(feature_to_add) mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash == orig_hash assert ccaplog.text.count(warning_message) == 2 assert mini_immuno_schema_flexible.n_members == 6 feature_to_add.delete(permanent=True) # change is flexible (an auxiliary field) -------------------------------- assert mini_immuno_schema_flexible.flexible mini_immuno_schema_flexible.flexible = False mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash != orig_hash assert ccaplog.text.count(warning_message) == 3 # restore original setting mini_immuno_schema_flexible.flexible = True mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash == orig_hash assert ccaplog.text.count(warning_message) == 4 # change coerce (formerly auxiliary field, now Django field) -------------------------------- assert not mini_immuno_schema_flexible.coerce mini_immuno_schema_flexible.coerce = True mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash != orig_hash assert ccaplog.text.count(warning_message) == 5 # restore original setting mini_immuno_schema_flexible.coerce = False mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash == orig_hash assert ccaplog.text.count(warning_message) == 6 # add an index -------------------------------- index_feature = ln.Feature(name="immuno_sample", dtype=str).save() mini_immuno_schema_flexible.index = index_feature mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash != orig_hash assert mini_immuno_schema_flexible.n_members == 7 assert ccaplog.text.count(warning_message) == 7 # remove the index mini_immuno_schema_flexible.index = None mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.n_members == 6 assert mini_immuno_schema_flexible.hash == orig_hash assert ccaplog.text.count(warning_message) == 8 index_feature.delete(permanent=True) # make a feature optional -------------------------------- required_feature = mini_immuno_schema_flexible.features.first() mini_immuno_schema_flexible.optionals.add(required_feature) mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash != orig_hash assert ccaplog.text.count(warning_message) == 9 # make it required again mini_immuno_schema_flexible.optionals.remove(required_feature) mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash == orig_hash assert ccaplog.text.count(warning_message) == 10 artifact.delete(permanent=True) def test_schema_mutations_feature_removal( mini_immuno_schema_flexible: ln.Schema, ccaplog ): feature1 = ln.Feature.get(name="perturbation") feature2 = ln.Feature.get(name="cell_type_by_model") dummy_artifact = ln.Artifact(".gitignore", key=".gitignore").save() # define the schema the first time schema = ln.Schema(name="My test schema X", features=[feature1, feature2]).save() assert schema.features.count() == 2 dummy_artifact.schema = schema # pretend artifact was validated with this schema 
dummy_artifact.save()
    # define the schema a second time, with fewer features
    schema1 = ln.Schema(name="My test schema X", features=[feature2]).save()
    # retrieves same schema because of name equality
    assert ccaplog.text.count("you're removing these features:") == 1
    assert (
        ccaplog.text.count("you updated the schema hash and might invalidate datasets")
        == 1
    )
    assert schema1 == schema
    assert schema1.features.count() == 1
    dummy_artifact.delete(permanent=True)
    schema.delete(permanent=True)


def test_schema_add_remove_optional_features(mini_immuno_schema_flexible: ln.Schema):
    schema = mini_immuno_schema_flexible
    initial_hash = schema.hash
    feature_project = ln.Feature(name="project", dtype=ln.Project).save()
    schema.add_optional_features([feature_project])
    assert schema.hash != initial_hash
    schema.remove_optional_features([feature_project])
    assert schema.hash == initial_hash


def test_schema_components(mini_immuno_schema_flexible: ln.Schema):
    obs_schema = mini_immuno_schema_flexible
    var_schema = ln.Schema(
        name="scRNA_seq_var_schema",
        itype=bt.Gene.ensembl_gene_id,
        dtype="num",
    ).save()
    # test recreation of schema based on name lookup
    var_schema2 = ln.Schema(
        name="scRNA_seq_var_schema",
        itype=bt.Gene.ensembl_gene_id,
        dtype="num",
    ).save()
    assert var_schema == var_schema2
    with pytest.raises(InvalidArgument) as error:
        ln.Schema(
            name="mini_immuno_anndata_schema",
            slots={"obs": obs_schema, "var": var_schema},
        ).save()
    assert str(error.value) == "Please pass otype != None for composite schemas"
    anndata_schema = ln.Schema(
        name="mini_immuno_anndata_schema",
        otype="AnnData",
        slots={"obs": obs_schema, "var": var_schema},
    ).save()
    var_schema2 = ln.Schema(
        name="symbol_var_schema",
        itype=bt.Gene.symbol,
        dtype="num",
    ).save()
    # try adding another schema under slot "var"
    # we want to trigger the unique constraint on slot
    with pytest.raises(IntegrityError) as error:
        anndata_schema.components.add(  # type: ignore
            var_schema2, through_defaults={"slot": "var"}
        )
    assert "unique" in str(error.value).lower()
    anndata_schema.delete(permanent=True)
    var_schema2.delete(permanent=True)
    var_schema.delete(permanent=True)


def test_mini_immuno_schema_flexible(mini_immuno_schema_flexible):
    schema = ln.Schema(
        name="Mini immuno schema",
        features=[
            ln.Feature.get(name="perturbation"),
            ln.Feature.get(name="cell_type_by_model"),
            ln.Feature.get(name="assay_oid"),
            ln.Feature.get(name="donor"),
            ln.Feature.get(name="concentration"),
            ln.Feature.get(name="treatment_time_h"),
        ],
        flexible=True,  # _additional_ columns in a dataframe are validated & annotated
    )
    assert schema.name == "Mini immuno schema"
    assert schema.itype == "Feature"
    assert (
        schema._list_for_hashing[:6]
        == [
            "b=Feature",
            "c=True",
            "d=False",
            "e=False",
            "f=True",
            "h=6",
            "j=HASH_OF_FEATURE_UIDS",  # this last hash is not deterministic in a unit test
        ][:6]
    )


def test_schema_recovery_based_on_hash(mini_immuno_schema_flexible: ln.Schema):
    feature1 = ln.Feature.get(name="perturbation")
    feature2 = ln.Feature.get(name="cell_type_by_model")
    schema = ln.Schema(features=[feature1, feature2]).save()
    schema2 = ln.Schema(features=[feature1, feature2])
    assert schema == schema2
    schema.delete()
    schema2 = ln.Schema(features=[feature1, feature2])
    assert schema != schema2
    schema.delete(permanent=True)


def test_schemas_dataframe():
    # test on the Python level after record creation -- no saving!
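    # the assertions below inspect `_list_for_hashing`, the ordered component list
    # from which the schema hash is derived; identical definitions therefore yield
    # identical hashes, e.g. (sketch, not executed here):
    #     s1 = ln.Schema(name="valid_features", itype=ln.Feature)
    #     s2 = ln.Schema(name="valid_features", itype=ln.Feature)
    #     assert s1.hash == s2.hash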
schema = ln.Schema(name="valid_features", itype=ln.Feature) assert schema.name == "valid_features" assert schema.itype == "Feature" assert schema._list_for_hashing == [ "b=Feature", "c=True", "d=False", "e=False", ] assert schema.hash == "kMi7B_N88uu-YnbTLDU-DA" # test the convenience function schema = ln.examples.schemas.valid_features() assert schema.uid == "0000000000000000" assert schema.name == "valid_features" assert schema.itype == "Feature" assert schema.hash == "kMi7B_N88uu-YnbTLDU-DA" def test_schemas_anndata(): # test on the Python level after record creation -- no saving! obs_schema = ln.examples.schemas.valid_features() varT_schema = ln.Schema( name="valid_ensembl_gene_ids", itype=bt.Gene.ensembl_gene_id ) assert varT_schema._list_for_hashing == [ "a=num", "b=bionty.Gene.ensembl_gene_id", "c=True", "d=False", "e=False", ] assert varT_schema.name == "valid_ensembl_gene_ids" assert varT_schema.itype == "bionty.Gene.ensembl_gene_id" assert varT_schema.hash == "1gocc_TJ1RU2bMwDRK-WUA" schema = ln.Schema( name="anndata_ensembl_gene_ids_and_valid_features_in_obs", otype="AnnData", slots={"obs": obs_schema, "var.T": varT_schema.save()}, ) assert schema._list_for_hashing == [ "a=num", "c=True", "d=False", "e=False", "l=GPZ-TzvKRhdC1PQAhlFiow", ] assert schema.name == "anndata_ensembl_gene_ids_and_valid_features_in_obs" assert schema.itype is None assert schema.hash == "aqGWHvyY49W_PHELUMiBMw" # test the convenience function schema = ln.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs() assert schema.uid == "0000000000000002" assert schema.name == "anndata_ensembl_gene_ids_and_valid_features_in_obs" assert schema.itype is None assert schema.hash == "aqGWHvyY49W_PHELUMiBMw" varT_schema = schema.slots["var.T"] assert varT_schema.uid == "0000000000000001" assert varT_schema.name == "valid_ensembl_gene_ids" assert varT_schema.itype == "bionty.Gene.ensembl_gene_id" assert varT_schema.hash == "1gocc_TJ1RU2bMwDRK-WUA" schema.delete(permanent=True) def test_schema_already_saved_aux(): """When attempting to save a Schema that was already saved before which populated `_aux` fields, we expect the Schema to be returned with the same `_aux` fields. 
Test for https://github.com/laminlabs/lamindb/issues/2887 """ var_schema = ln.Schema( name="test var", index=ln.Feature( name="var_index", dtype=bt.Gene.ensembl_gene_id, cat_filters={ "source": bt.Source.get( entity="bionty.Gene", currently_used=True, organism="human" ) }, ).save(), itype=ln.Feature, dtype="DataFrame", minimal_set=True, coerce=True, ).save() schema = ln.Schema( name="AnnData schema", otype="AnnData", minimal_set=True, coerce=True, slots={"var": var_schema}, ).save() # _aux["af"] now only contains key "3" (index_feature_uid) since coerce and flexible are Django fields assert len(schema.slots["var"]._aux["af"].keys()) == 1 assert "3" in schema.slots["var"]._aux["af"] # index_feature_uid # coerce and flexible are now proper Django fields assert schema.slots["var"].coerce is True assert schema.slots["var"].flexible is False # Attempting to save the same schema again should return the Schema with the same fields var_schema_2 = ln.Schema( name="test var", index=ln.Feature( name="var_index", dtype=bt.Gene.ensembl_gene_id, cat_filters={ "source": bt.Source.get( entity="bionty.Gene", currently_used=True, organism="human" ) }, ).save(), itype=ln.Feature, dtype="DataFrame", minimal_set=True, coerce=True, ).save() schema_2 = ln.Schema( name="AnnData schema", otype="AnnData", minimal_set=True, coerce=True, slots={"var": var_schema_2}, ).save() assert len(schema.slots["var"]._aux["af"].keys()) == 1 assert schema.slots["var"]._aux == schema_2.slots["var"]._aux assert schema.slots["var"].coerce == schema_2.slots["var"].coerce assert schema.slots["var"].flexible == schema_2.slots["var"].flexible schema_2.delete(permanent=True) schema.delete(permanent=True) def test_schema_not_saved_describe(): schema = ln.Schema(name="NotSavedSchema", is_type=True) with pytest.raises(ValueError) as e: schema.describe() assert "Schema must be saved before describing" in str(e.value) def test_schema_is_type(): Sample = ln.Schema(name="Sample", is_type=True).save() assert Sample.hash is None BioSample = ln.Schema(name="BioSample", is_type=True, type=Sample).save() assert BioSample.hash is None assert BioSample.type == Sample assert BioSample.is_type # create a schema without any features or slots or itype or is_type=True with pytest.raises(InvalidArgument) as e: ln.Schema(name="TechSample", type=Sample) assert "Please pass features or slots or itype or set is_type=True" in str(e.value) # clean up BioSample.delete(permanent=True) Sample.delete(permanent=True) # see test_component_composite in test_transform.py def test_composite_component(): composite = ln.Schema(name="composite", itype=ln.Feature).save() component1 = ln.Schema(name="component1", itype=bt.CellType).save() component2 = ln.Schema(name="component2", itype=bt.CellMarker).save() composite.components.add(component1, through_defaults={"slot": "slot1"}) composite.components.add(component2, through_defaults={"slot": "slot2"}) assert len(composite.components.all()) == 2 assert composite.links_component.count() == 2 assert set(composite.links_component.all().to_list("slot")) == {"slot1", "slot2"} assert composite.links_component.first().composite == composite assert composite.composites.count() == 0 assert composite.links_composite.count() == 0 ln.models.SchemaComponent.filter(composite=composite).delete(permanent=True) link = ln.models.SchemaComponent( composite=composite, component=component1, slot="var" ).save() assert link in composite.links_component.all() assert link in component1.links_composite.all() assert link.slot == "var" 
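    # cleanup below removes composite and components; the final count check asserts
    # that the SchemaComponent link rows disappear along with them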
composite.delete(permanent=True) component1.delete(permanent=True) component2.delete(permanent=True) assert ln.models.SchemaComponent.filter().count() == 0 def test_schema_describe_bracket_names(): """Feature names with brackets like 'characteristics[organism]' must appear verbatim in describe output. Regression test for Rich interpreting '[...]' as markup tags and swallowing bracket content. """ features = [ ln.Feature(name="source name", dtype="str").save(), ln.Feature(name="characteristics[organism]", dtype="str").save(), ln.Feature(name="characteristics[disease]", dtype="str").save(), ln.Feature(name="comment[instrument]", dtype="str").save(), ] schema = ln.Schema(features, name="test_brackets").save() result = schema.describe(return_str=True) assert "characteristics[organism]" in result assert "characteristics[disease]" in result assert "comment[instrument]" in result schema.delete(permanent=True) for feature in features: feature.delete(permanent=True) ================================================ FILE: tests/core/test_search.py ================================================ import bionty as bt import lamindb as ln import pytest @pytest.fixture(scope="module") def prepare_cell_type_registry(): bt.CellType.filter().delete(permanent=True) records = [ { "ontology_id": "CL:0000084", "name": "T cell", "synonyms": "T-cell|T-lymphocyte|T lymphocyte", "children": ["CL:0000798", "CL:0002420", "CL:0002419", "CL:0000789"], }, { "ontology_id": "CL:0000236", "name": "B cell", "synonyms": "B-lymphocyte|B lymphocyte|B-cell", "children": ["CL:0009114", "CL:0001201"], }, { "ontology_id": "CL:0000696", "name": "PP cell", "synonyms": "type F enteroendocrine cell", "children": ["CL:0002680"], }, { "ontology_id": "CL:0002072", "name": "nodal myocyte", "synonyms": "P cell|myocytus nodalis|cardiac pacemaker cell", "children": ["CL:1000409", "CL:1000410"], }, ] public_records = [] for ref_record in records: record = bt.CellType.from_source(ontology_id=ref_record["ontology_id"]) assert record.name == ref_record["name"] assert set(record.synonyms.split("|")) == set(ref_record["synonyms"].split("|")) public_records.append(record) ln.save(public_records) yield "prepared" bt.CellType.filter().delete(permanent=True) def test_search_synonyms(prepare_cell_type_registry): result = bt.CellType.search("P cell").to_dataframe() assert set(result.name.iloc[:2]) == {"nodal myocyte", "PP cell"} def test_search_limit(prepare_cell_type_registry): result = bt.CellType.search("P cell", limit=1).to_dataframe() assert len(result) == 1 def test_search_case_sensitive(prepare_cell_type_registry): result = bt.CellType.search("b cell", case_sensitive=False).to_dataframe() assert result.name.iloc[0] == "B cell" def test_search_None(): with pytest.raises( ValueError, match="Cannot search for None value! Please pass a valid string." 
): bt.CellType.search(None) ================================================ FILE: tests/core/test_settings.py ================================================ import lamindb as ln import pytest def test_settings_repr(): repr_str = repr(ln.settings) lines = repr_str.split("\n") assert "Settings" in lines[0] assert all(line.startswith(" ") for line in lines[1:]) content = "\n".join(lines[1:]) assert content.find("instance:") < content.find("storage:") assert content.find("storage:") < content.find("verbosity:") assert content.find("verbosity:") < content.find("track_run_inputs:") def test_storage_setter_raises_on_foreign_managed_storage(tmp_path): storage = ln.Storage(root=(tmp_path / "foreign-managed-storage").as_posix()).save() storage.instance_uid = "_not_exists_" storage.save() with pytest.raises(ValueError) as error: ln.settings.storage = storage.root assert ( error.exconly() == f"ValueError: Storage '{storage.root}' exists in another instance (_not_exists_), cannot write to it from here." ) storage.delete() def test_local_storage_setter_raises_on_foreign_managed_storage(tmp_path): storage = ln.Storage( root=(tmp_path / "foreign-managed-local-storage").as_posix() ).save() storage.instance_uid = "_not_exists_" storage.save() with pytest.raises(ValueError) as error: ln.settings.local_storage = storage.root assert ( error.exconly() == f"ValueError: Storage '{storage.root}' exists in another instance (_not_exists_), cannot write to it from here." ) storage.delete() ================================================ FILE: tests/core/test_sqlrecord.py ================================================ import re import shutil import textwrap from pathlib import Path import bionty as bt import lamindb as ln import pandas as pd import pytest from lamindb.errors import FieldValidationError from lamindb.models.sqlrecord import ( _get_record_kwargs, _search, get_name_field, suggest_records_with_similar_names, ) def test_feature_describe(): description = textwrap.dedent("""\ Feature Simple fields .uid: CharField .name: CharField .unit: CharField .description: TextField .array_rank: SmallIntegerField .array_size: IntegerField .array_shape: JSONField .synonyms: TextField .default_value: JSONField .nullable: BooleanField .coerce: BooleanField .is_type: BooleanField .is_locked: BooleanField .created_at: DateTimeField .updated_at: DateTimeField Relational fields .branch: Branch .created_on: Branch .space: Space .created_by: User .run: Run .type: Feature .schemas: Schema .features: Feature .values: JsonValue .projects: Project .ablocks: FeatureBlock """).strip() assert description == ln.Feature.describe(return_str=True) def test_artifact_describe(): description = textwrap.dedent("""\ Artifact Simple fields .uid: CharField .key: CharField .description: TextField .suffix: CharField .kind: CharField .otype: CharField .size: BigIntegerField .hash: CharField .n_files: BigIntegerField .n_observations: BigIntegerField .version_tag: CharField .is_latest: BooleanField .is_locked: BooleanField .created_at: DateTimeField .updated_at: DateTimeField Relational fields .branch: Branch .created_on: Branch .space: Space .storage: Storage .run: Run .schema: Schema .created_by: User .input_of_runs: Run .recreating_runs: Run .schemas: Schema .json_values: JsonValue .artifacts: Artifact .linked_in_records: Record .users: User .runs: Run .linked_by_runs: Run .ulabels: ULabel .linked_by_artifacts: Artifact .collections: Collection .records: Record .references: Reference .projects: Project .ablocks: ArtifactBlock Bionty fields 
.organisms: bionty.Organism .genes: bionty.Gene .proteins: bionty.Protein .cell_markers: bionty.CellMarker .tissues: bionty.Tissue .cell_types: bionty.CellType .diseases: bionty.Disease .cell_lines: bionty.CellLine .phenotypes: bionty.Phenotype .pathways: bionty.Pathway .experimental_factors: bionty.ExperimentalFactor .developmental_stages: bionty.DevelopmentalStage .ethnicities: bionty.Ethnicity """).strip() assert description == ln.Artifact.describe(return_str=True) def test_repr_describe(): user = ln.User.filter().first() assert user.__repr__().startswith("User") assert user.describe(return_str=True).startswith("User") def test_record_describe_includes_features(): record = ln.Record(name="describe record").save() feature = ln.Feature(name="describe_metric", dtype=float).save() record.features.add_values({"describe_metric": 1.23}) output = record.describe(return_str=True) assert "Features" in output assert "describe_metric" in output assert "1.23" in output record.delete(permanent=True) feature.delete(permanent=True) def test_validate_literal_fields(): # validate literal with pytest.raises(FieldValidationError): ln.Transform(key="new-name-not-existing-123", kind="invalid") def test_init_with_args(): with pytest.raises( FieldValidationError, match=re.escape( "Use keyword arguments instead of positional arguments, e.g.: User(name='...')" ) + r".*", ): # can't use Record here because it raises "Only one non-keyword arg allowed" ln.User("an arg") def test_validate_required_fields(): # ULabel has a required name with pytest.raises(FieldValidationError): ln.ULabel() # ULabel has a required name with pytest.raises(FieldValidationError): ln.ULabel(description="test") @pytest.fixture def get_search_test_filepaths(): Path("unregistered_storage/").mkdir(exist_ok=True) filepaths = [Path(f"./unregistered_storage/test-search{i}.txt") for i in range(6)] for filepath in filepaths: filepath.write_text(filepath.name) yield None shutil.rmtree("unregistered_storage/") def test_search_and_get(get_search_test_filepaths): artifact1 = ln.Artifact( "./unregistered_storage/test-search1.txt", description="nonsense" ) artifact1.save() artifact2 = ln.Artifact( "./unregistered_storage/test-search2.txt", description="nonsense" ) artifact2.save() # on purpose to be search3 to test duplicated search artifact0 = ln.Artifact( "./unregistered_storage/test-search0.txt", description="test-search3" ) artifact0.save() artifact3 = ln.Artifact( "./unregistered_storage/test-search3.txt", description="test-search3" ) artifact3.save() artifact4 = ln.Artifact( "./unregistered_storage/test-search4.txt", description="test-search4" ) artifact4.save() result = ln.Artifact.search("search3").to_dataframe() assert result.iloc[0].description == "test-search3" assert result.iloc[1].description == "test-search3" # no returning entries if all search results have __ratio__ 0 # need a better search string below # assert ln.Artifact.search("x").shape[0] == 0 artifact5 = ln.Artifact( "./unregistered_storage/test-search5.txt", key="test-search5.txt" ) artifact5.save() res = ln.Artifact.search("search5").to_dataframe() assert res.iloc[0].key == "test-search5.txt" res_q = ln.Artifact.search("search5") assert res_q[0].key == "test-search5.txt" # queryset returns the same order of results assert res.uid.tolist() == [i.uid for i in res_q] # multi-field search res = ln.Artifact.search( "txt", field=["key", "description", "suffix"] ).to_dataframe() assert res.iloc[0].suffix == ".txt" # get artifact = ln.Artifact.get(description="test-search4") assert 
artifact == artifact4 with pytest.raises(ln.errors.ObjectDoesNotExist): ln.Artifact.get(description="test-does-not-exist") artifact0.delete(permanent=True, storage=True) artifact1.delete(permanent=True, storage=True) artifact2.delete(permanent=True, storage=True) artifact3.delete(permanent=True, storage=True) artifact4.delete(permanent=True, storage=True) artifact5.delete(permanent=True, storage=True) def test_suggest_similar_names(): record1 = ln.Record(name="Test experiment 1").save() record2 = ln.Record(name="Test experiment 2").save() record3 = ln.Record(name="Special test experiment abc").save() record4 = ln.Record(name="A very special test experiment abc").save() assert ln.Record(name="Test experiment 1").uid == record1.uid assert suggest_records_with_similar_names( record1, "name", {"name": "Test experiment 1"} ) assert not suggest_records_with_similar_names( record2, "name", {"name": "Test experiment 123"} ) queryset = _search( ln.Record, "Test experiment 123", field="name", truncate_string=True, limit=3, ) assert queryset.count() == 3 queryset = _search( ln.Record, "Special test experiment abc", field="name", truncate_string=True, limit=3, ) assert queryset.count() == 2 assert queryset[0].name == "Special test experiment abc" record1.delete(permanent=True) record2.delete(permanent=True) record3.delete(permanent=True) record4.delete(permanent=True) def test_pass_version(): # creating a new transform on key retrieves the same transform # for as long as no source_code was saved transform = ln.Transform(key="mytransform", version="1").save() assert transform.version_tag == "1" assert transform.version == "1" assert ln.Transform(key="mytransform", version="1") == transform # in case source code is saved transform.source_code = "dummy" transform.save() with pytest.raises(ValueError) as e: ln.Transform(key="mytransform", version="1") assert ( e.exconly() == "ValueError: Please change the version tag or leave it `None`, '1' is already taken" ) def test_delete(): record = ln.Record(name="test-delete") # record not yet saved, delete has no effect result = record.delete() assert result is None assert record.branch_id == 1 record.save() result = record.delete() assert result is None assert record.branch_id == -1 result = record.delete(permanent=True) assert isinstance(result, tuple) assert len(result) == 2 deleted_count, deleted_dict = result assert deleted_count == 1 assert isinstance(deleted_dict, dict) assert ln.Record.filter(name="test-delete").exists() is False def test_get_name_field(): transform = ln.Transform(key="test").save() assert get_name_field(ln.Run(transform)) == "started_at" with pytest.raises(ValueError): get_name_field(ln.Artifact.records.through()) transform.delete(permanent=True) def test_using(): # the two below calls error if the records aren't found ln.Artifact.connect("laminlabs/lamin-site-assets").get(1) ln.Artifact.connect("laminlabs/lamin-site-assets").get(uid="MqEaGU7fXvxNy61R0000") # cross-database query hemangioblast = bt.CellType.from_source(name="hemangioblast").save() artifact = ( ln.Artifact.connect("laminlabs/lamin-dev") .filter(cell_types=hemangioblast) .first() ) assert artifact is not None hemangioblast_dev = artifact.cell_types.get(name="hemangioblast") assert hemangioblast_dev.uid == hemangioblast.uid assert hemangioblast_dev.id != hemangioblast.id # query via list artifact_ref = ( ln.Artifact.connect("laminlabs/lamin-dev") .filter(cell_types__in=[hemangioblast]) .first() ) assert artifact == artifact_ref # check that .using provided with the current 
instance does nothing
    assert ln.User.connect("lamindb-unit-tests-core").first()._state.db == "default"
    user = ln.setup.settings.user.handle
    assert (
        ln.User.connect(f"{user}/lamindb-unit-tests-core").first()._state.db
        == "default"
    )


def test_get_record_kwargs():
    assert _get_record_kwargs(ln.Feature) == [
        ("name", "str"),
        ("dtype", "DtypeStr | ULabel | Record | Registry | list[Registry] | FieldAttr"),
        ("type", "Feature | None"),
        ("is_type", "bool"),
        ("unit", "str | None"),
        ("description", "str | None"),
        ("synonyms", "str | None"),
        ("nullable", "bool | None"),
        (
            "default_value",
            "Any | None",
        ),
        ("coerce", "bool | None"),
        (
            "cat_filters",
            "dict[str",
        ),
    ]


def test_get_record_kwargs_empty():
    class EmptySQLRecord:
        pass

    assert _get_record_kwargs(EmptySQLRecord) == []

    class NoInitSQLRecord:
        def method(self):
            pass

    assert _get_record_kwargs(NoInitSQLRecord) == []


def test_soft_delete_error():
    with pytest.raises(ValueError):
        ln.Storage.filter().first().delete(permanent=False)
    with pytest.raises(ValueError):
        ln.Branch.filter().first().delete(permanent=False)


def test_delete_return_value_permanent():
    """Test that permanent delete returns Django's natural return value."""
    # Test with ULabel (simple SQLRecord)
    ulabel = ln.ULabel(name="test-delete-return").save()
    result = ulabel.delete(permanent=True)
    assert isinstance(result, tuple)
    assert len(result) == 2
    deleted_count, deleted_dict = result
    assert deleted_count == 1
    assert isinstance(deleted_dict, dict)
    assert len(deleted_dict) > 0
    # Check that the registry name is in the dict
    # Django returns app_label.ClassName format
    registry_name = f"{ulabel._meta.app_label}.{ulabel.__class__.__name__}"
    assert registry_name in deleted_dict
    assert deleted_dict[registry_name] == 1


def test_unsaved_relationship_modification_attempts():
    af = ln.Artifact.from_dataframe(
        pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]}), description="testme"
    )
    new_label = ln.Record(name="testlabel").save()
    with pytest.raises(ValueError) as excinfo:
        af.records.add(new_label)
    assert (
        str(excinfo.value)
        == "You are trying to access the many-to-many relationships of an unsaved Artifact object. Please save it first using '.save()'."
    )
    new_label.delete(permanent=True)
    af.delete(permanent=True)


def test_failed_connect():
    with pytest.raises(ln.setup.errors.InstanceNotFoundError) as error:
        ln.Artifact.connect("laminlabs/lamindata-not-existing")
    assert error.exconly().startswith(
        "lamindb_setup.errors.InstanceNotFoundError: 'laminlabs/lamindata-not-existing' not found: 'instance-not-found'"
    )


def test_unsaved_model_different_instance():
    af = ln.Artifact.connect("laminlabs/lamindata").get(
        key="scrna/micro-macfarland2020.h5ad"
    )
    new_label = ln.Record(name="testlabel").save()
    with pytest.raises(ValueError) as excinfo:
        af.records.add(new_label)
    assert (
        str(excinfo.value)
        == "Cannot label a record from instance 'laminlabs/lamindata'. "
        "Please save the record first to your instance using '.save()'."
) new_label.delete(permanent=True) def test_track_fields_with_deferred_columns(example_dataframe: pd.DataFrame): artifact = ln.Artifact.from_dataframe( example_dataframe, key="deferred-track-fields.parquet" ).save() # loading a tracked field as deferred should not crash in __init__ deferred_artifact = ln.Artifact.filter(id=artifact.id).only("id").one() assert deferred_artifact.id == artifact.id assert not deferred_artifact._field_changed("space_id") artifact.delete(permanent=True) def test_track_fields_must_exist_on_model(monkeypatch, example_dataframe: pd.DataFrame): artifact = ln.Artifact.from_dataframe( example_dataframe, key="invalid-track-field.parquet" ).save() monkeypatch.setattr(ln.Artifact, "_TRACK_FIELDS", ("space_id", "not_a_real_field")) with pytest.raises( FieldValidationError, match="_TRACK_FIELDS contains invalid field for Artifact: not_a_real_field", ): ln.Artifact.get(artifact.id) artifact.delete(permanent=True) ================================================ FILE: tests/core/test_storage.py ================================================ import concurrent.futures import lamindb as ln # we need this test both in the core and the storage/cloud tests # because the internal logic that retrieves information about other instances # depends on whether the current instance is managed on the hub def test_reference_storage_location(ccaplog): ln.Artifact("s3://lamindata/iris_studies/study0_raw_images") assert ln.Storage.get(root="s3://lamindata").instance_uid == "4XIuR0tvaiXM" assert ( "referenced read-only storage location at s3://lamindata, is managed by instance with uid 4XIuR0tvaiXM" in ccaplog.text ) def test_create_storage_locations_parallel(): root: str = "nonregistered_storage" def create_storage() -> str: ln.Storage(root=root).save() # type: ignore return root n_parallel = 3 with concurrent.futures.ThreadPoolExecutor(max_workers=n_parallel) as executor: futures = [executor.submit(create_storage) for i in range(n_parallel)] _ = [future.result() for future in concurrent.futures.as_completed(futures)] storage = ln.Storage.get(root__endswith=root) storage.delete() ================================================ FILE: tests/core/test_switch.py ================================================ """Tests for ln.setup.switch.""" import lamindb as ln import pytest def test_switch_create_existing_branch_raises(): """Switch with create=True and existing branch raises BranchAlreadyExists with hint.""" with pytest.raises(ln.errors.BranchAlreadyExists) as exc_info: ln.setup.switch("main", create=True) msg = str(exc_info.value) assert "already exists" in msg assert "-c/--create" in msg or "Omit" in msg ================================================ FILE: tests/core/test_track_flow.py ================================================ import time from pathlib import Path from typing import Iterable import lamindb as ln import pandas as pd import pytest from lamindb.errors import InvalidArgument @ln.flow(global_run="clear") def process_chunk( chunk_id: int, artifact_param: ln.Artifact, records_params: Iterable[ln.Record] ) -> str: # Create a simple DataFrame df = pd.DataFrame( {"id": range(chunk_id * 10, (chunk_id + 1) * 10), "value": range(10)} ) env_file = Path("file_with_same_hash.txt") env_file.write_text("1") ln.Artifact(env_file, description="file_with_same_hash").save() # Save it as an artifact key = f"chunk_{chunk_id}.parquet" artifact = ln.Artifact.from_dataframe(df, key=key).save() assert ln.context.run is not None return artifact.key def test_flow(): param_artifact = 
ln.Artifact(".gitignore", key="param_artifact").save() ln.Record(name="record1").save(), ln.Record(name="record2").save() records_params = ln.Record.filter(name__startswith="record") assert ln.context.run is None artifact_key = process_chunk(1, param_artifact, records_params) assert ln.context.run is None # Verify the artifacts and runs artifacts = [ln.Artifact.get(key=key) for key in [artifact_key]] same_hash_artifacts = ln.Artifact.filter(description="file_with_same_hash") runs = [artifact.run for artifact in artifacts] # Verify each run has the correct start and finish times for run in runs: print(f"Run details: {run}") assert run.started_at is not None assert run.finished_at is not None assert run.started_at < run.finished_at assert run.status == "completed" assert isinstance(run.params["chunk_id"], int) assert run.params["artifact_param"].startswith( f"Artifact[{param_artifact.uid}]" ) assert run.params["records_params"] == [ f"Record[{record.uid}]" for record in records_params ] # test error behavior with pytest.raises(RuntimeError) as error: ln.context._run = run process_chunk(1, param_artifact, records_params) ln.context._run = None assert str(error.exconly()).startswith( "RuntimeError: Please use @ln.step() or clear the global run context before using @ln.flow(): no `ln.track()` or `@ln.flow(global_run='clear')`" ) # Clean up test artifacts runs = [] for artifact in artifacts: runs.append(artifact.run) artifact.delete(permanent=True) param_artifact.delete(permanent=True) same_hash_artifacts[0].delete(permanent=True) Path("file_with_same_hash.txt").unlink() for run in runs: run.delete(permanent=True) ln.context._run = None def test_flow_track_arg_aliases_implicit(): unique = time.time_ns() missing_project = f"missing-flow-project-{unique}" @ln.flow(global_run="clear") def flow_with_implicit_project_alias(project: str) -> None: pass with pytest.raises(InvalidArgument) as error: flow_with_implicit_project_alias(project=missing_project) assert error.exconly().startswith( f"lamindb.errors.InvalidArgument: Project '{missing_project}' not found" ) def test_flow_track_arg_aliases_false(): unique = time.time_ns() missing_project = f"missing-flow-project-{unique}" @ln.flow(global_run="clear", track_arg_aliases=False) def flow_without_project_alias(project: str) -> str: assert ln.context.run is not None return ln.context.run.uid run = None try: run_uid = flow_without_project_alias(project=missing_project) run = ln.Run.get(uid=run_uid) assert run.params["project"] == missing_project finally: ln.context._run = None if run is not None: run.delete(permanent=True) run.transform.delete(permanent=True) ================================================ FILE: tests/core/test_track_script_or_notebook.py ================================================ import signal import subprocess import sys import time from pathlib import Path from unittest.mock import MagicMock, patch import lamindb as ln import lamindb_setup as ln_setup import pytest from lamindb._finish import clean_r_notebook_html, get_shortcut from lamindb._secret_redaction import redact_secrets_in_source_code from lamindb.core._context import ( REDACTED_SECRET_VALUE, LogStreamTracker, context, detect_and_process_source_code_file, serialize_params_to_json, ) from lamindb.errors import InvalidArgument, TrackNotCalled, ValidationError from lamindb_setup.core.upath import UPath SCRIPTS_DIR = Path(__file__).parent.resolve() / "scripts" NOTEBOOKS_DIR = Path(__file__).parent.resolve() / "notebooks" def test_serialize_params_to_json(): a_path = 
Path("/some/local/folder") a_upath = UPath("s3://bucket/key") params = { "path_key": a_path, "none_key": None, "empty_list_key": [], "list_str_key": ["string"], "upath_key": a_upath, "str_key": "plain", "api_key": "test-api-key-value", "openAIApiKey": "another-secret", "database_url": "postgresql://db_user:db_password@db.example.com:5432/mydb", } result = serialize_params_to_json(params) # None is omitted assert "none_key" not in result # Empty list is omitted (same as None) assert "empty_list_key" not in result # Path is serialized to posix string assert result["path_key"] == "/some/local/folder" # UPath is serialized to posix string assert result["upath_key"] == "s3://bucket/key" # List of strings is JSON-serialized as-is (list[cat ? str]) assert result["list_str_key"] == ["string"] # Other values unchanged assert result["str_key"] == "plain" assert result["api_key"] == REDACTED_SECRET_VALUE assert result["openAIApiKey"] == REDACTED_SECRET_VALUE assert result["database_url"] == REDACTED_SECRET_VALUE assert set(result.keys()) == { "path_key", "upath_key", "str_key", "list_str_key", "api_key", "openAIApiKey", "database_url", } def test_redact_secrets_in_source_code(): source_code = """ api_key = "test-api-key-value" openAIApiKey = "another-secret" uid = "a6yhtobqTjQM6q8t" db_url = "postgresql://db_user:db_password@db.example.com:5432/mydb" os.environ["API_KEY"] = "sdk-key" config = {"client_secret": "client-secret-value", "id": "abc123"} """ redacted, redaction_count = redact_secrets_in_source_code(source_code) assert redaction_count == 5 assert 'api_key = "***REDACTED***"' in redacted assert 'openAIApiKey = "***REDACTED***"' in redacted assert 'db_url = "***REDACTED***"' in redacted assert 'os.environ["API_KEY"] = "***REDACTED***"' in redacted assert '"client_secret": "***REDACTED***"' in redacted assert 'uid = "a6yhtobqTjQM6q8t"' in redacted def test_redact_secrets_in_source_code_keeps_env_references(): source_code = """ api_key = os.getenv("OPENAI_API_KEY") openAIApiKey = getenv("OPENAI_API_KEY") model_api_key = os.environ["MODEL_API_KEY"] provider_token = os.environ.get("PROVIDER_TOKEN") """ redacted, redaction_count = redact_secrets_in_source_code(source_code) # Env lookups are references, not embedded literals. Keep them for rerunnable source code. assert redaction_count == 0 assert 'api_key = os.getenv("OPENAI_API_KEY")' in redacted assert 'openAIApiKey = getenv("OPENAI_API_KEY")' in redacted assert 'model_api_key = os.environ["MODEL_API_KEY"]' in redacted assert 'provider_token = os.environ.get("PROVIDER_TOKEN")' in redacted def test_redact_secrets_in_source_code_ignores_annotations_and_forwarding(): source_code = """ def run(api_key: str) -> None: raise RuntimeError("fail") run_agent( api_key=api_key, ) """ redacted, redaction_count = redact_secrets_in_source_code(source_code) # Do not treat Python type annotations or argument forwarding as hardcoded secrets. 
assert redaction_count == 0 assert "def run(api_key: str) -> None:" in redacted assert "api_key=api_key," in redacted def test_serialize_params_to_json_redacts_provider_api_key_names(): params = { "LAMIN_API_KEY": "lamin-super-secret", "OPENAI_API_KEY": "openai-super-secret", "ANTHROPIC_API_KEY": "anthropic-super-secret", "GEMINI_API_KEY": "gemini-super-secret", "provider_name": "safe-value", } result = serialize_params_to_json(params) assert result["LAMIN_API_KEY"] == REDACTED_SECRET_VALUE assert result["OPENAI_API_KEY"] == REDACTED_SECRET_VALUE assert result["ANTHROPIC_API_KEY"] == REDACTED_SECRET_VALUE assert result["GEMINI_API_KEY"] == REDACTED_SECRET_VALUE assert result["provider_name"] == "safe-value" def test_redact_secrets_in_source_code_redacts_provider_api_key_names(): source_code = """ LAMIN_API_KEY = "lamin-super-secret" OPENAI_API_KEY = "openai-super-secret" ANTHROPIC_API_KEY = "anthropic-super-secret" GEMINI_API_KEY = "gemini-super-secret" provider = "openai" """ redacted, redaction_count = redact_secrets_in_source_code(source_code) assert redaction_count == 4 assert 'LAMIN_API_KEY = "***REDACTED***"' in redacted assert 'OPENAI_API_KEY = "***REDACTED***"' in redacted assert 'ANTHROPIC_API_KEY = "***REDACTED***"' in redacted assert 'GEMINI_API_KEY = "***REDACTED***"' in redacted assert 'provider = "openai"' in redacted def test_track_basic_invocation(): project = "non-existing project" with pytest.raises(ln.errors.InvalidArgument) as error: ln.track(project=project) assert ( error.exconly() == f"lamindb.errors.InvalidArgument: Project '{project}' not found, either create it with `ln.Project(name='...').save()` or fix typos." ) space = "non-existing space" with pytest.raises(ln.errors.InvalidArgument) as error: ln.track(space=space) assert ( error.exconly() == f"lamindb.errors.InvalidArgument: Space '{space}', please check on the hub UI whether you have the correct `uid` or `name`." 
) test_transform = ln.Transform(key="test_transform").save() # first invocation using features kwargs = {"param1": 1, "param2": "my-string", "param3": 3.14} with pytest.raises(ValidationError) as exc: ln.track(transform=test_transform, features=kwargs) assert exc.exconly().startswith( """lamindb.errors.ValidationError: These keys could not be validated: ['param1', 'param2', 'param3']""" ) feature1 = ln.Feature(name="param1", dtype=int).save() feature2 = ln.Feature(name="param2", dtype=str).save() feature3 = ln.Feature(name="param3", dtype=float).save() feature4 = ln.Feature(name="label_param", dtype=ln.Record).save() record = ln.Record(name="my_label").save() kwargs["label_param"] = "my_label" ln.track(transform=test_transform, features=kwargs) assert ln.context.run.features.get_values() == kwargs print(ln.context.run.features.describe(return_str=True)) assert ( ln.context.run.features.describe(return_str=True) == f"""\ Run: {ln.context.run.uid[:7]} ({ln.context.run.transform.key}) └── Features └── label_param Record my_label param1 int 1 param2 str my-string param3 float 3.14""" ) # also call describe() plainly without further checks ln.context.run.describe() # second invocation kwargs = {"param1": 1, "param2": "my-string", "param3": 3.14, "param4": [1, 2]} param4 = ln.Feature(name="param4", dtype="int").save() with pytest.raises(ValidationError) as exc: ln.track(transform=test_transform, features=kwargs) assert "Column 'param4' failed dtype check for 'int': got object" in exc.exconly() # fix param4 dtype param4.delete(permanent=True) param4 = ln.Feature(name="param4", dtype=list[int]).save() # re-run ln.track(transform=test_transform, features=kwargs) assert ln.context.run.features.get_values() == kwargs # now use the params arg ln.track(transform=test_transform, params=kwargs) assert ln.context.run.params == kwargs assert ln.Run.filter(params__param1=kwargs["param1"]).count() == 1 # test that run populates things like records record = ln.Record(name="my-label-in-track") assert record.run == ln.context.run # test that we can call ln.finish() also for pipeline-like transforms run = ln.context.run assert run.finished_at is None ln.finish() assert ( run.finished_at is not None ) # context is cleared after finish(); use captured run # clean up run.delete(permanent=True) ln.models.RunJsonValue.filter(run__transform=test_transform).delete(permanent=True) ln.models.RunRecord.filter(run__transform=test_transform).delete(permanent=True) feature1.delete(permanent=True) feature2.delete(permanent=True) feature3.delete(permanent=True) feature4.delete(permanent=True) param4.delete(permanent=True) test_transform.delete(permanent=True) def test_track_accepts_initiated_by_run_uid(): unique = time.time_ns() parent_transform = ln.Transform(key=f"parent-run-{unique}").save() child_transform = ln.Transform(key=f"child-run-{unique}").save() parent_run = ln.Run(transform=parent_transform).save() try: ln.track( transform=child_transform, initiated_by_run=parent_run.uid, new_run=True, ) assert ln.context.run is not None assert ln.context.run.initiated_by_run is not None assert ln.context.run.initiated_by_run.uid == parent_run.uid ln.finish() with pytest.raises(InvalidArgument) as error: ln.track( transform=child_transform, initiated_by_run="does-not-exist", new_run=True, ) assert error.exconly().startswith( "lamindb.errors.InvalidArgument: Run 'does-not-exist' not found" ) finally: ln.context._run = None ln.Run.filter(transform=child_transform).delete(permanent=True) parent_run.delete(permanent=True) 
child_transform.delete(permanent=True) parent_transform.delete(permanent=True) def test_track_uses_initiated_by_run_uid_from_env(monkeypatch: pytest.MonkeyPatch): unique = time.time_ns() parent_transform = ln.Transform(key=f"parent-run-env-{unique}").save() child_transform = ln.Transform(key=f"child-run-env-{unique}").save() parent_run = ln.Run(transform=parent_transform).save() try: monkeypatch.setenv("LAMIN_INITIATED_BY_RUN_UID", parent_run.uid) ln.track(transform=child_transform, new_run=True) assert ln.context.run is not None assert ln.context.run.initiated_by_run is not None assert ln.context.run.initiated_by_run.uid == parent_run.uid ln.finish() finally: ln.context._run = None ln.Run.filter(transform=child_transform).delete(permanent=True) parent_run.delete(permanent=True) child_transform.delete(permanent=True) parent_transform.delete(permanent=True) @pytest.mark.parametrize("pass_plan_as_key", [False, True], ids=["artifact", "key"]) def test_track_with_plan_links_run(tmp_path, pass_plan_as_key): unique = time.time_ns() plan_path = tmp_path / f"my-agent-plan-{unique}.md" plan_path.write_text("# Agent plan\n\n- Step 1\n") plan_artifact = ln.Artifact( plan_path, key=f".plans/my-agent-plan-{unique}.md", kind="plan", ).save() transform = ln.Transform(key=f"test-track-with-plan-{unique}").save() try: plan = plan_artifact.key if pass_plan_as_key else plan_artifact ln.track(transform=transform, plan=plan) run = ln.context.run assert run.plan is not None assert run.plan.uid == plan_artifact.uid run_from_db = ln.Run.get(uid=run.uid) assert run_from_db.plan is not None assert run_from_db.plan.uid == plan_artifact.uid ln.finish() finally: ln.context._run = None ln.Run.filter(transform=transform).delete(permanent=True) plan_artifact.delete(permanent=True) transform.delete(permanent=True) @pytest.fixture def create_record(): """Factory fixture that returns a function to create records.""" created_records = [] def create(kind: str) -> ln.models.SQLRecord: if kind == "artifact": record = ln.Artifact("README.md", key="README.md").save() elif kind == "collection": a1 = ln.Artifact("README.md", key="README.md").save() created_records.append(a1) a2 = ln.Artifact("pyproject.toml", key="pyproject.toml").save() created_records.append(a2) record = ln.Collection([a1, a2], key="test-collection").save() created_records.append(record) return record yield create for record in created_records[::-1]: record.delete(permanent=True) @pytest.mark.parametrize("kind", ["artifact", "collection"]) def test_track_input_record(create_record, kind): # First run ln.track() previous_run = ln.context.run record = create_record(kind) record.cache() assert ( record not in getattr(ln.context.run, f"input_{kind}s").all() ) # avoid cycle with created artifact # Second run ln.track(new_run=True) assert ln.context.run != previous_run record = create_record(kind) assert ln.context.run in record.recreating_runs.all() assert record._subsequent_run_id == ln.context.run.id record.cache() assert ( record not in getattr(ln.context.run, f"input_{kind}s").all() ) # avoid cycle with re-created artifact # Third run ln.track(new_run=True) assert ln.context.run != previous_run if kind == "artifact": record = ln.Artifact.get(key="README.md") else: record = ln.Collection.get(key="test-collection") record.cache() assert ln.context.run not in record.recreating_runs.all() assert not hasattr(record, "_subsequent_run_id") assert record in getattr(ln.context.run, f"input_{kind}s").all() # regular input def test_track_notebook_colab(): notebook_path = 
"/fileId=1KskciVXleoTeS_OGoJasXZJreDU9La_l" ln.context._track_notebook(path_str=notebook_path) def test_track_notebook_untitled(): notebook_path = "Untitled.ipynb" with pytest.raises(RuntimeError) as error: ln.context._track_notebook(path_str=notebook_path) assert ( "Your notebook file name is 'Untitled.ipynb', please rename it before tracking. You might have to re-start your notebook kernel." in error.exconly() ) def test_detect_and_process_source_code_file_returns_key_from_module_for_package(): """When path is inferred from stack and caller __name__ has '.', key_from_module is module path.""" script_path = str(SCRIPTS_DIR / "script-to-test-versioning.py") mock_frame = MagicMock() mock_frame.f_globals = {"__name__": "mypackage.mymodule"} with patch("inspect.stack") as mock_stack: mock_stack.return_value = [ MagicMock(), MagicMock(), ( mock_frame, script_path, MagicMock(), MagicMock(), MagicMock(), MagicMock(), ), ] path, kind, ref, ref_type, key_from_module = ( detect_and_process_source_code_file(path=None) ) assert key_from_module == "pypackages/mypackage/mymodule.py" assert path == Path(script_path) def test_detect_and_process_source_code_file_returns_none_key_for_script(): """When path is inferred from stack and caller __name__ has no '.', key_from_module is None.""" script_path = str(SCRIPTS_DIR / "script-to-test-versioning.py") mock_frame = MagicMock() mock_frame.f_globals = {"__name__": "__main__"} with patch("inspect.stack") as mock_stack: mock_stack.return_value = [ MagicMock(), MagicMock(), ( mock_frame, script_path, MagicMock(), MagicMock(), MagicMock(), MagicMock(), ), ] path, kind, ref, ref_type, key_from_module = ( detect_and_process_source_code_file(path=None) ) assert key_from_module is None def test_finish_before_track(): ln.context._run = None with pytest.raises(TrackNotCalled) as error: ln.finish() assert "Please run `ln.track()` before `ln.finish()" in error.exconly() def test_invalid_transform_kind(): transform = ln.Transform(key="test transform") ln.track(transform=transform) ln.context._path = None ln.context.run.transform.kind = "script" with pytest.raises(ValueError) as error: ln.finish() assert "Transform type is not allowed to be" in error.exconly() # unset to remove side effects ln.context._run = None def test_create_or_load_transform(): title = "title" version = "2.0" uid = "NJvdsWWbJlZS0000" context.uid = uid context.version = version context._path = Path("my-test-transform-create-or-load.py") context._path.touch(exist_ok=True) context._create_or_load_transform( description=title, transform_kind="notebook", ) assert context._transform.uid == uid assert context._transform.version_tag == version assert context._transform.description == title context._create_or_load_transform( description=title, ) assert context._transform.uid == uid assert context._transform.version_tag == version assert context._transform.description == title # now, test an updated transform name context._create_or_load_transform( description="updated title", ) assert context._transform.uid == uid assert context._transform.version_tag == version assert context._transform.description == "updated title" # unset to remove side effects ln.context._uid = None ln.context._run = None ln.context._transform = None ln.context._path.unlink() ln.context._path = None def test_create_or_load_transform_warns_when_outside_dev_dir( tmp_path, ccaplog: pytest.LogCaptureFixture ): previous_dev_dir = ln_setup.settings.dev_dir path_outside_dev_dir = tmp_path / f"outside-{time.time_ns()}.py" 
path_outside_dev_dir.write_text("print('track test')\n") expected_key = path_outside_dev_dir.name transform: ln.Transform | None = None try: ln_setup.settings.dev_dir = tmp_path / "configured-dev-dir" ln_setup.settings.dev_dir.mkdir(exist_ok=True) ccaplog.clear() context._path = path_outside_dev_dir context._create_or_load_transform(description="outside dev dir warning test") transform = context._transform assert "falling back to using filename as transform key" in ccaplog.text assert transform.key == expected_key finally: ln_setup.settings.dev_dir = previous_dev_dir ln.context._uid = None ln.context._run = None ln.context._transform = None ln.context._path = None if transform is not None: transform.delete(permanent=True) def test_run_scripts(): # regular execution result = subprocess.run( # noqa: S602 f"python {SCRIPTS_DIR / 'script-to-test-versioning.py --param 42'}", shell=True, capture_output=True, ) assert result.returncode == 0 assert "created Transform('Ro1gl7n8YrdH0000'" in result.stdout.decode() assert "started new Run(" in result.stdout.decode() transform = ln.Transform.get("Ro1gl7n8YrdH0000") assert transform.latest_run.cli_args == "--param 42" # updated key (filename change) result = subprocess.run( # noqa: S602 f"python {SCRIPTS_DIR / 'script-to-test-filename-change.py'}", shell=True, capture_output=True, ) assert result.returncode == 0 assert "renaming transform" in result.stdout.decode() transform = ln.Transform.get(key="script-to-test-filename-change.py") assert transform.latest_run.cli_args is None # version already taken result = subprocess.run( # noqa: S602 f"python {SCRIPTS_DIR / 'duplicate1/script-to-test-versioning.py'}", shell=True, capture_output=True, ) assert result.returncode == 1 assert ( "✗ version '1' is already taken by Transform('Ro1gl7n8YrdH0000'); please set another version, e.g., ln.context.version = '1.1'" in result.stderr.decode() ) # regular version bump result = subprocess.run( # noqa: S602 f"python {SCRIPTS_DIR / 'duplicate2/script-to-test-versioning.py'}", shell=True, capture_output=True, ) assert result.returncode == 0 assert "created Transform('Ro1gl7n8YrdH0002'" in result.stdout.decode() assert "started new Run(" in result.stdout.decode() assert not ln.Transform.get("Ro1gl7n8YrdH0001").is_latest assert ln.Transform.get("Ro1gl7n8YrdH0002").is_latest # inconsistent version result = subprocess.run( # noqa: S602 f"python {SCRIPTS_DIR / 'duplicate3/script-to-test-versioning.py'}", shell=True, capture_output=True, ) assert result.returncode == 1 assert ( "Transform is already tagged with version 2, but you passed 3" in result.stderr.decode() ) # multiple folders, do not match the key because of the folder structure ln.Transform.filter(key__endswith="script-to-test-versioning.py").update( key="teamA/script-to-test-versioning.py" ) # this test creates a transform with key script-to-test-versioning.py at the root level result = subprocess.run( # noqa: S602 f"python {SCRIPTS_DIR / 'duplicate4/script-to-test-versioning.py'}", shell=True, capture_output=True, ) assert result.returncode == 0 assert "ignoring transform" in result.stdout.decode() transform = ln.Transform.get(key="script-to-test-versioning.py") # multiple folders, match the key, also test is finished result = subprocess.run( # noqa: S602 f"python {SCRIPTS_DIR / 'duplicate5/script-to-test-versioning.py'}", shell=True, capture_output=True, ) assert result.returncode == 0 assert f"{transform.stem_uid}" in result.stdout.decode() assert "making new version" in result.stdout.decode() transform = 
ln.Transform.get(key="script-to-test-versioning.py") assert transform.latest_run.finished_at is not None def test_run_external_script(): script_path = "sub/lamin-cli/tests/scripts/run-track-and-finish-sync-git.py" result = subprocess.run( # noqa: S602 f"python {script_path}", shell=True, capture_output=True, ) print(result.stdout.decode()) print(result.stderr.decode()) assert result.returncode == 0 assert "created Transform" in result.stdout.decode() assert "started new Run" in result.stdout.decode() transform = ln.Transform.get(key="run-track-and-finish-sync-git.py") # the algorithm currently picks different commits depending on the state of the repo # any of these commits are valid assert transform.uid == "m5uCHTTpJnjQ0000" assert transform.reference.endswith( "/tests/scripts/run-track-and-finish-sync-git.py" ) assert transform.reference.startswith( "https://github.com/laminlabs/lamin-cli/blob/" ) assert transform.reference_type == "url" assert transform.description == "My good script" # ensure that the source code is not saved as an output artifact assert transform.latest_run.output_artifacts.count() == 0 assert transform.runs.count() == 1 assert transform.hash == "VC1oTPcaVSrzNrXUT9p4qw" @pytest.mark.parametrize("type", ["notebook", "script"]) def test_track_notebook_or_script_manually(type): transform = ln.Transform(key="My notebook", kind=type) with pytest.raises(ValueError) as error: ln.track(transform=transform) assert ( error.exconly() == "ValueError: Use `ln.track()` without passing transform in a notebook or script - metadata is automatically parsed" ) def test_clean_r_notebook_html(): orig_notebook_path = NOTEBOOKS_DIR / "basic-r-notebook.Rmd.html" content = orig_notebook_path.read_text() orig_notebook_path.write_text(content.replace("SHORTCUT", get_shortcut())) comparison_path = NOTEBOOKS_DIR / "basic-r-notebook.Rmd.cleaned.html" compare = comparison_path.read_text() comparison_path.unlink() title_text, cleaned_path = clean_r_notebook_html(orig_notebook_path) assert comparison_path == cleaned_path assert title_text == "My exemplary R analysis" assert compare == cleaned_path.read_text() # check that things have been stripped comparison_path.write_text(compare) orig_notebook_path.write_text(content.replace(get_shortcut(), "SHORTCUT")) def test_notebook_to_script_notebooknode_metadata(tmp_path): """Test that notebook_to_script handles NotebookNode metadata. 
https://github.com/laminlabs/lamindb/issues/3480 """ import nbformat from lamindb._finish import notebook_to_script nb = nbformat.v4.new_notebook() nb.metadata["kernelspec"] = nbformat.NotebookNode({"display_name": "python3"}) notebook_path = tmp_path / "test.ipynb" nbformat.write(nb, notebook_path) # This would raise RepresenterError without metadata.clear() result = notebook_to_script("Test", notebook_path) assert result is not None assert "NotebookNode" not in result class MockRun: def __init__(self, uid): self.uid = uid self.report = None self.saved = False def save(self): self.saved = True def test_logstream_tracker_multiple(): tracker1 = LogStreamTracker() tracker2 = LogStreamTracker() tracker3 = LogStreamTracker() try: # Start trackers one by one and print messages print("Initial stdout") tracker1.start(MockRun("run1")) print("After starting tracker1") tracker2.start(MockRun("run2")) print("After starting tracker2") tracker3.start(MockRun("run3")) print("After starting tracker3") print("Testing stderr", file=sys.stderr) time.sleep(0.1) # Clean up in reverse order tracker3.finish() tracker2.finish() tracker1.finish() # Verify log contents - each log should only contain messages after its start expected_contents = { 1: [ "After starting tracker1", "After starting tracker2", "After starting tracker3", "Testing stderr", ], 2: ["After starting tracker2", "After starting tracker3", "Testing stderr"], 3: ["After starting tracker3", "Testing stderr"], } for i in range(1, 4): log_path = Path(ln_setup.settings.cache_dir / f"run_logs_run{i}.txt") with open(log_path) as f: content = f.read() print(f"\nContents of run{i} log:") print(content) # Check each expected line is in the content for expected_line in expected_contents[i]: assert expected_line in content, ( f"Expected '{expected_line}' in log {i}" ) # Check earlier messages are NOT in the content if i > 1: assert "Initial stdout" not in content assert "After starting tracker" + str(i - 1) not in content finally: # Cleanup for i in range(1, 4): log_path = Path(ln_setup.settings.cache_dir / f"run_logs_run{i}.txt") if log_path.exists(): log_path.unlink() def test_logstream_tracker_exception_handling(): tracker = LogStreamTracker() original_excepthook = sys.excepthook run = MockRun("error") try: tracker.start(run) print("Before error") # Create and capture exception info exc_type = ValueError exc_value = ValueError("Test error") exc_traceback = None try: raise exc_value except ValueError: exc_traceback = sys.exc_info()[2] # Handle the exception - this will trigger cleanup tracker.handle_exception(exc_type, exc_value, exc_traceback) # Verify run status assert run.saved assert run.report is not None # Verify the content was written before cleanup content = run.report.cache().read_text() print("Log contents:", content) assert "Before error" in content assert "ValueError: Test error" in content assert "Traceback" in content finally: tracker.finish() sys.excepthook = original_excepthook log_path = Path(ln_setup.settings.cache_dir / f"run_logs_{run.uid}.txt") if log_path.exists(): log_path.unlink() def test_logstream_tracker_cleanup_sigint_chains_to_keyboard_interrupt(): tracker = LogStreamTracker() run = MockRun("sigint") original_excepthook = sys.excepthook def raising_sigint_handler(signum, frame): raise KeyboardInterrupt try: with ( patch( "signal.getsignal", side_effect=[signal.SIG_DFL, raising_sigint_handler], ), patch("signal.signal"), patch("lamindb._finish.save_run_logs"), ): tracker.start(run) with pytest.raises(KeyboardInterrupt): 
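                # cleanup on SIGINT is expected to chain to the previously registered
                # handler, which raises KeyboardInterrupt in this test setup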
tracker.cleanup(signo=signal.SIGINT, frame=None) finally: tracker.finish() sys.excepthook = original_excepthook log_path = Path(ln_setup.settings.cache_dir / f"run_logs_{run.uid}.txt") if log_path.exists(): log_path.unlink() ================================================ FILE: tests/core/test_track_step.py ================================================ import concurrent.futures from pathlib import Path from typing import Iterable import lamindb as ln import pandas as pd import pytest @ln.step() def process_chunk( chunk_id: int, artifact_param: ln.Artifact, records_params: Iterable[ln.Record] ) -> str: # Create a simple DataFrame df = pd.DataFrame( {"id": range(chunk_id * 10, (chunk_id + 1) * 10), "value": range(10)} ) env_file = Path("file_with_same_hash.txt") env_file.write_text("1") ln.Artifact(env_file, description="file_with_same_hash").save() # Save it as an artifact key = f"chunk_{chunk_id}.parquet" artifact = ln.Artifact.from_dataframe(df, key=key).save() return artifact.key def test_step_parallel(): # Ensure no global run from a previous test (e.g. test_flow) ln.context._run = None with pytest.raises(RuntimeError) as err: process_chunk(4) assert ( err.exconly() == "RuntimeError: Please track the global run context before using @ln.step(): ln.track() or @ln.flow()" ) # Ensure tracking is on ln.track() # Number of parallel executions n_parallel = 3 param_artifact = ln.Artifact(".gitignore", key="param_artifact").save() ln.Record(name="record1").save(), ln.Record(name="record2").save() records_params = ln.Record.filter(name__startswith="record") # Use ThreadPoolExecutor for parallel execution with concurrent.futures.ThreadPoolExecutor(max_workers=n_parallel) as executor: # Submit all tasks futures = [ executor.submit(process_chunk, i, param_artifact, records_params) for i in range(n_parallel) ] # Get results as they complete chunk_keys = [ future.result() for future in concurrent.futures.as_completed(futures) ] # Verify results # Each execution should have created its own artifact with unique run print(f"Created artifacts with keys: {chunk_keys}") artifacts = [ln.Artifact.get(key=key) for key in chunk_keys] same_hash_artifacts = ln.Artifact.filter(description="file_with_same_hash") # Check that we got the expected number of artifacts assert len(artifacts) == n_parallel assert ( len(same_hash_artifacts) == 1 ) # only one artifact with the same hash should exist # Verify each artifact has its own unique run runs = [artifact.run for artifact in artifacts] run_ids = [run.id for run in runs] print(f"Run IDs: {run_ids}") assert len(set(run_ids)) == n_parallel # all runs should be unique # Verify each run has the correct start and finish times for run in runs: print(f"Run details: {run}") assert run.started_at is not None assert run.finished_at is not None assert run.started_at < run.finished_at assert run.status == "completed" assert isinstance(run.params["chunk_id"], int) assert run.params["artifact_param"].startswith( f"Artifact[{param_artifact.uid}]" ) assert run.params["records_params"] == [ f"Record[{record.uid}]" for record in records_params ] # Clean up test artifacts runs = [] for artifact in artifacts: runs.append(artifact.run) artifact.delete(permanent=True) param_artifact.delete(permanent=True) same_hash_artifacts[0].delete(permanent=True) Path("file_with_same_hash.txt").unlink() for run in runs: run.delete(permanent=True) ln.context._uid = None ln.context._run = None ln.context._transform = None ln.context._path = None ================================================ FILE: 
tests/core/test_transform.py ================================================ from pathlib import Path from unittest.mock import patch import lamindb as ln import pytest def test_transform_recovery_based_on_hash(): transform1 = ln.Transform(key="my-transform", source_code="1").save() transform2 = ln.Transform(key="my-transform", source_code="1") assert transform1 == transform2 transform1.delete() transform2 = ln.Transform(key="my-transform", source_code="1") assert transform1 != transform2 transform1.delete(permanent=True) def test_transform_recovery_based_on_key(): transform1 = ln.Transform(key="my-transform").save() transform2 = ln.Transform(key="my-transform") assert transform1 == transform2 transform1.delete() transform2 = ln.Transform(key="my-transform") assert transform1 != transform2 transform1.delete(permanent=True) def test_revise_transforms(): # attempt to create a transform with an invalid version with pytest.raises(ValueError) as error: transform = ln.Transform(key="My transform", version=0) assert ( error.exconly() == "ValueError: `version` parameter must be `None` or `str`, e.g., '0.1', '1'," " '2', etc." ) # create a versioned transform transform = ln.Transform(key="My transform", version="1") assert transform.version_tag == "1" assert transform.version == "1" assert len(transform.uid) == ln.Transform._len_full_uid == 16 assert len(transform.stem_uid) == ln.Transform._len_stem_uid == 12 transform.save() # try to reload the same transform with the same uid transform_reload = ln.Transform(uid=transform.uid, key="My transform updated name") assert transform_reload.id == transform.id assert transform_reload.key == "My transform" # unchanged, prints logging transform_reload = ln.Transform( uid=transform.uid, description="My transform updated name" ) assert transform_reload.id == transform.id assert ( transform_reload.description == "My transform updated name" ) # unchanged, prints logging # create new transform from old transform transform_r2 = ln.Transform(description="My 2nd transform", revises=transform) assert transform_r2.uid != transform.uid assert transform_r2.uid.endswith("0001") transform_r2 = ln.Transform(description="My 2nd transform", revises=transform) assert transform_r2.uid != transform.uid assert transform_r2.uid.endswith("0001") assert transform_r2.stem_uid == transform.stem_uid assert transform_r2.version_tag is None assert ( transform_r2.version == transform_r2.uid[-4:] ) # version falls back to uid suffix assert transform_r2.is_latest assert transform.is_latest transform_r2.save() assert not transform.is_latest # create new transform from newly versioned transform transform_r3 = ln.Transform( description="My transform", revises=transform_r2, version="2" ) assert transform_r3.stem_uid == transform.stem_uid assert transform_r3.version_tag == "2" assert transform_r3.version == "2" # default description transform_r3 = ln.Transform(revises=transform_r2) assert transform_r3.description == transform_r2.description # revise by matching on `key` key = "my-notebook.ipynb" transform_r2.key = key transform_r2.save() assert transform_r2.is_latest transform_r3 = ln.Transform(description="My transform", key=key, version="2") assert transform_r3.uid[:-4] == transform_r2.uid[:-4] assert transform_r3.uid.endswith("0001") # this only fires if source code was actually saved transform_r2.source_code = "something" transform_r2.save() transform_r3 = ln.Transform(description="My transform", key=key, version="2") assert transform_r3.uid[:-4] == transform_r2.uid[:-4] assert 
transform_r3.uid.endswith("0002") assert transform_r3.stem_uid == transform_r2.stem_uid assert transform_r3.key == key assert transform_r3.version_tag == "2" assert transform_r3.version == "2" assert transform_r3.is_latest # because the new transform isn't yet saved, the old transform still has # is_latest = True assert transform_r2.is_latest assert transform_r3._revises is not None transform_r3.save() # now r2 is no longer the latest version, but need to re-fresh from db transform_r2 = ln.Transform.get(transform_r2.uid) assert not transform_r2.is_latest # wrong transform type with pytest.raises(TypeError) as error: ln.Transform(revises=ln.Record(name="x")) assert error.exconly().startswith( "TypeError: `revises` has to be of type `Transform`" ) # wrong kwargs with pytest.raises(ValueError) as error: ln.Transform(x=1) assert ( error.exconly() == "ValueError: Only key, description, version_tag, type, revises," " reference, reference_type can be passed, but you passed: {'x': 1}" ) # test that reference transform cannot be deleted transform_r2.delete() transform.delete() # unversioned transform transform = ln.Transform(key="My transform") assert transform.version_tag is None assert transform.version == transform.uid[-4:] # version falls back to uid suffix # what happens if we don't save the old transform? # add a test for it! transform.save() # create new transform from old transform new_transform = ln.Transform(description="My new transform", revises=transform) assert transform.version_tag is None assert transform.version == transform.uid[-4:] # version falls back to uid suffix assert new_transform.stem_uid == transform.stem_uid assert new_transform.uid.endswith("0001") assert new_transform.version_tag is None assert ( new_transform.version == new_transform.uid[-4:] ) # version falls back to uid suffix transform.delete(permanent=True) def test_delete(): # prepare the creation of a transform with its artifacts transform = ln.Transform(key="My transform").save() run = ln.Run(transform) report_path = Path("report.html") with open(report_path, "w") as f: f.write("a") environment_path = Path("environment.txt") with open(environment_path, "w") as f: f.write("c") report = ln.Artifact(report_path, description=f"Report of {run.uid}").save() report_path.unlink() report_path = report.path environment = ln.Artifact(environment_path, description="requirements.txt").save() environment_path.unlink() environment_path = environment.path transform.save() run.report = report run.environment = environment run.save() assert report_path.exists() assert environment_path.exists() # now delete everything (run artifacts are cleaned up in background subprocess) transform.delete(permanent=True) assert len(ln.Run.filter(id=run.id)) == 0 # Clean up orphan report/env artifacts if subprocess has not run yet for art in [report, environment]: a = ln.Artifact.filter(id=art.id).first() if a is not None: a.delete(permanent=True, storage=True) assert not report_path.exists() assert not environment_path.exists() assert len(ln.Artifact.filter(id__in=[report.id, environment.id])) == 0 # see test_composite_component in test_schema.py def test_successor_predecessor(): predecessor = ln.Transform(key="predecessor").save() successor1 = ln.Transform(key="successor1").save() successor2 = ln.Transform(key="successor2").save() predecessor.successors.add( successor1, successor2, through_defaults={"config": {"param": 42}} ) assert len(predecessor.successors.all()) == 2 assert predecessor.links_successor.count() == 2 assert 
predecessor.links_successor.first().config == {"param": 42} assert predecessor.links_successor.first().predecessor == predecessor assert predecessor.predecessors.count() == 0 assert predecessor.links_predecessor.count() == 0 ln.models.transform.TransformTransform.filter(predecessor=predecessor).delete( permanent=True ) link = ln.models.transform.TransformTransform( predecessor=predecessor, successor=successor1, config={"param": 42} ).save() assert link in predecessor.links_successor.all() assert link in successor1.links_predecessor.all() assert link.config == {"param": 42} predecessor.delete(permanent=True) successor1.delete(permanent=True) successor2.delete(permanent=True) assert ln.models.transform.TransformTransform.filter().count() == 0 def test_bulk_transform_permanent_delete(tmp_path): """Bulk Transform permanent delete deletes TransformProject, runs (and artifacts), then transforms.""" transform = ln.Transform(key="Bulk transform delete").save() runs = [ln.Run(transform).save() for _ in range(2)] report_files = [tmp_path / f"bulk_report_{i}.txt" for i in range(2)] for f in report_files: f.write_text("report content") report_artifacts = [ ln.Artifact(str(f), description=f"report {i}").save() for i, f in enumerate(report_files) ] for run, art in zip(runs, report_artifacts): run.report = art run.save() transform_id = transform.id run_ids = [r.id for r in runs] artifact_ids = [r.report_id for r in runs] with patch("lamindb.models.run.subprocess.Popen") as mock_popen: ln.Transform.filter(id=transform_id).delete(permanent=True) mock_popen.assert_called_once() args = mock_popen.call_args[0][0] ids_str = args[args.index("--ids") + 1] assert {int(x) for x in ids_str.split(",")} == set(artifact_ids) assert ln.Transform.filter(id=transform_id).count() == 0 for rid in run_ids: assert ln.Run.filter(id=rid).count() == 0 # With mock, cleanup subprocess did not run; clean up orphan report artifacts for aid in artifact_ids: art = ln.Artifact.filter(id=aid).first() if art is not None: art.delete(permanent=True, storage=False) def test_single_transform_permanent_delete_delegates_to_queryset(tmp_path): """Single Transform permanent delete delegates to QuerySet and removes runs and artifacts.""" transform = ln.Transform(key="Single transform delete").save() run = ln.Run(transform).save() report_file = tmp_path / "single_report.txt" report_file.write_text("report") report = ln.Artifact(str(report_file), description="report").save() run.report = report run.save() transform_id = transform.id run_id = run.id artifact_id = report.id with patch("lamindb.models.run.subprocess.Popen") as mock_popen: transform.delete(permanent=True) mock_popen.assert_called_once() args = mock_popen.call_args[0][0] ids_str = args[args.index("--ids") + 1] assert artifact_id in {int(x) for x in ids_str.split(",")} assert ln.Transform.filter(id=transform_id).count() == 0 assert ln.Run.filter(id=run_id).count() == 0 # With mock, cleanup subprocess did not run; clean up orphan report artifact art = ln.Artifact.filter(id=artifact_id).first() if art is not None: art.delete(permanent=True, storage=False) def test_bulk_transform_soft_delete(): """Bulk Transform soft delete sets branch_id=-1.""" transform = ln.Transform(key="Bulk transform soft delete").save() ln.Run(transform).save() transform_id = transform.id ln.Transform.filter(id=transform_id).delete(permanent=False) t = ln.Transform.filter(id=transform_id).one() assert t.branch_id == -1 ln.Transform.filter(id=transform_id).delete(permanent=True) def 
test_bulk_transform_permanent_delete_promotes_previous_version(): """Bulk permanent delete of latest in a version family promotes the previous version.""" v1 = ln.Transform(key="Bulk permanent delete version family").save() v2 = ln.Transform(revises=v1, key="Bulk permanent delete version family").save() assert v2.is_latest stem_uid = v1.stem_uid ln.Transform.filter(id=v2.id).delete(permanent=True) assert ln.Transform.filter(id=v2.id).count() == 0 v1_after = ln.Transform.filter(uid__startswith=stem_uid).one() assert v1_after.pk == v1.pk assert v1_after.is_latest v1.delete(permanent=True) def test_bulk_transform_soft_delete_promotes_previous_version(): """Bulk soft delete of latest in a version family promotes the previous version.""" v1 = ln.Transform(key="Bulk soft delete version family").save() v2 = ln.Transform(revises=v1, key="Bulk soft delete version family").save() assert v2.is_latest v2_id = v2.id stem_uid = v1.stem_uid ln.Transform.filter(id=v2_id).delete(permanent=False) v2_after = ln.Transform.filter(id=v2_id).one() assert v2_after.branch_id == -1 assert not v2_after.is_latest v1.refresh_from_db() assert v1.is_latest assert ln.Transform.filter(uid__startswith=stem_uid).get(is_latest=True) == v1 # Clean up v2_after.delete(permanent=True) v1.delete(permanent=True) ================================================ FILE: tests/core/test_transform_from_git.py ================================================ import lamindb as ln import pytest TEST_URL = "https://github.com/openproblems-bio/task_batch_integration" def test_transform_from_git(): # test auto-inferred latest commit hash transform1 = ln.Transform.from_git(url=TEST_URL, path="main.nf") assert transform1.source_code.startswith(f"""\ repo: {TEST_URL} path: main.nf commit:""") assert transform1.key == "openproblems-bio/task_batch_integration/main.nf" assert transform1.version_tag is None assert transform1.description is None assert transform1.reference.startswith(f"{TEST_URL}/blob/") assert transform1.reference_type == "url" # test checking out specific version transform2 = ln.Transform.from_git(url=TEST_URL, path="main.nf", version="v2.0.0") assert transform2.source_code.startswith(f"""\ repo: {TEST_URL} path: main.nf commit:""") assert transform2.version_tag == "v2.0.0" assert transform2.description is None assert transform1.source_code != transform2.source_code assert transform1.reference != transform2.reference # test with description transform2_with_desc = ln.Transform.from_git( url=TEST_URL, path="main.nf", version="v2.0.0", description="Test description" ) assert transform2_with_desc.description == "Test description" assert transform2_with_desc.version_tag == "v2.0.0" # test sliding transform from branch transform3 = ln.Transform.from_git( url=TEST_URL, path="main.nf", version="main", branch="main" ) assert transform3.source_code.startswith(f"""\ repo: {TEST_URL} path: main.nf branch:""") assert transform3.description is None assert transform3.reference == f"{TEST_URL}/tree/main/main.nf" assert transform3.reference_type == "url" def test_transform_from_git_with_entrypoint(): # test auto-inferred latest commit hash transform1 = ln.Transform.from_git( url=TEST_URL, path="main.nf", entrypoint="myentrypoint" ) assert transform1.source_code.startswith(f"""\ repo: {TEST_URL} path: main.nf entrypoint: myentrypoint commit:""") assert transform1.description is None # test with entrypoint and description transform2 = ln.Transform.from_git( url=TEST_URL, path="main.nf", entrypoint="myentrypoint", description="Entrypoint description", 
) assert transform2.description == "Entrypoint description" def test_transform_custom_key_and_hash_lookup(): # test auto-inferred latest commit hash transform1 = ln.Transform.from_git( url=TEST_URL, path="main.nf", key="mypipeline" ).save() assert transform1.key == "mypipeline" # trigger hash look up transform2 = ln.Transform.from_git(url=TEST_URL, path="main.nf", key="mypipeline2") assert transform1 == transform2 assert transform2.key == "mypipeline" # trigger hash look up transform2 = ln.Transform.from_git( url=TEST_URL, path="main.nf", key="mypipeline2", skip_hash_lookup=True ) assert transform1 != transform2 assert transform2.key == "mypipeline2" transform1.delete(permanent=True) def test_transform_from_git_failure_modes(): # invalid tag with pytest.raises(ValueError) as error: ln.Transform.from_git( url=TEST_URL, path="main.nf", version="invalid", ) assert error.exconly().startswith("ValueError: Failed to checkout version invalid") # invalid branch with pytest.raises(ValueError) as error: ln.Transform.from_git( url=TEST_URL, path="main.nf", branch="invalid", ) assert error.exconly().startswith("ValueError: Failed to checkout branch invalid") ================================================ FILE: tests/core/test_view.py ================================================ import lamindb as ln def test_view(): ln.view(modules="core") ln.view() ================================================ FILE: tests/curators/conftest.py ================================================ import shutil from time import perf_counter import lamindb_setup as ln_setup import pytest def pytest_sessionstart(): t_execute_start = perf_counter() ln_setup.init(storage="./test-curators-db", modules="bionty") total_time_elapsed = perf_counter() - t_execute_start print(f"time to setup the instance: {total_time_elapsed:.1f}s") def pytest_sessionfinish(session: pytest.Session): shutil.rmtree("./test-curators-db") ln_setup.delete("test-curators-db", force=True) @pytest.fixture def ccaplog(caplog): """Add caplog handler to our custom logger at session start.""" from lamin_utils._logger import logger logger.addHandler(caplog.handler) yield caplog logger.removeHandler(caplog.handler) ================================================ FILE: tests/curators/test_cellxgene_curation.py ================================================ from typing import Generator import bionty as bt import lamindb as ln import pytest @pytest.fixture def cellxgene_defaults() -> Generator: ln.examples.cellxgene.save_cellxgene_defaults() yield ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) ln.ULabel.filter(type__isnull=False).delete(permanent=True) for entity in [ bt.Disease, bt.Ethnicity, bt.DevelopmentalStage, bt.Phenotype, bt.CellType, ln.ULabel, ]: entity.filter().delete(permanent=True) def test_cellxgene_curation(cellxgene_defaults) -> None: """Tests validating a recent CELLxGENE dataset.""" ln.examples.cellxgene.save_cellxgene_defaults() cxg_schema = ln.examples.cellxgene.create_cellxgene_schema( field_types="ontology_id", organism="mouse", spatial_library_id="Thymus_Visium_Exp3A_V2S1_3wk_B6-WT", ) adata = ln.examples.datasets.anndata_visium_mouse_cellxgene() curator = ln.curators.AnnDataCurator(adata, cxg_schema) curator.validate() cxg_schema.delete(permanent=True) ================================================ FILE: tests/curators/test_curate_from_croissant.py ================================================ import shutil import lamindb as ln import pytest @pytest.mark.parametrize("filepath_prefix", [None, 
"test-curators-db/"]) def test_curate_artifact_from_croissant(filepath_prefix: str | None): croissant_path, dataset1_path = ln.examples.croissant.mini_immuno( n_files=1, filepath_prefix=filepath_prefix ) artifact1 = ln.integrations.curate_from_croissant(croissant_path) assert ( artifact1.description == "Mini immuno dataset - A few samples from the immunology dataset" ) assert artifact1.key == "mini_immuno.anndata.zarr" assert artifact1.version_tag == "1.0" assert ( artifact1._key_is_virtual if filepath_prefix is None else not artifact1._key_is_virtual ) license_label = artifact1.ulabels.get( name="https://creativecommons.org/licenses/by/4.0/" ) project_label = artifact1.projects.get(name="Mini Immuno Project") # now mutate the dataset and create a new version croissant_path, dataset1_path = ln.examples.croissant.mini_immuno( n_files=1, filepath_prefix=filepath_prefix, strip_version=True ) dummy_file_path = dataset1_path / "dummy_file.txt" dummy_file_path.write_text("dummy file") artifact2 = ln.integrations.curate_from_croissant(croissant_path) assert artifact2.description == artifact1.description assert artifact2.key == artifact1.key assert artifact2.version_tag is None assert artifact2.stem_uid == artifact1.stem_uid assert artifact2.uid != artifact1.uid assert ( artifact2._key_is_virtual if filepath_prefix is None else not artifact1._key_is_virtual ) license_label = artifact2.ulabels.get( name="https://creativecommons.org/licenses/by/4.0/" ) project_label = artifact2.projects.get(name="Mini Immuno Project") shutil.rmtree(dataset1_path) croissant_path.unlink() artifact1.delete(permanent=True, storage=True) # because of real storage key project_label.delete(permanent=True) license_label.delete(permanent=True) def test_curate_collection_from_croissant(): croissant_path, dataset1_path, dataset2_path = ln.examples.croissant.mini_immuno( n_files=2 ) collection = ln.integrations.curate_from_croissant(croissant_path) croissant_path.unlink() shutil.rmtree(dataset1_path) dataset2_path.unlink() artifact1 = collection.artifacts.get(key="mini_immuno.anndata.zarr") artifact2 = collection.artifacts.get(key="mini.csv") license_label = collection.ulabels.get( name="https://creativecommons.org/licenses/by/4.0/" ) project_label = collection.projects.get(name="Mini Immuno Project") collection.delete(permanent=True) artifact1.delete(permanent=True) artifact2.delete(permanent=True) project_label.delete(permanent=True) license_label.delete(permanent=True) ================================================ FILE: tests/curators/test_curators_examples.py ================================================ import sys from pathlib import Path docs_path = Path.cwd() / "docs" / "scripts" sys.path.append(str(docs_path)) import anndata as ad import bionty as bt import lamindb as ln import pandas as pd import pytest from lamindb.core import datasets from lamindb.errors import InvalidArgument, ValidationError @pytest.fixture(scope="module") def mini_immuno_schema(): # define labels perturbation = ln.ULabel(name="Perturbation", is_type=True).save() ln.ULabel(name="DMSO", type=perturbation).save() ln.ULabel(name="IFNG", type=perturbation).save() ln.ULabel(name="ulabel_but_not_perturbation").save() ln.ULabel.from_values(["sample1", "sample2", "sample3"], create=True).save() bt.CellType.from_source(name="B cell").save() bt.CellType.from_source(name="T cell").save() # in next iteration for attrs ln.Feature(name="temperature", dtype=float).save() # ln.Feature(name="experiment", dtype="cat[ULabel]").save() # 
ln.Feature(name="date_of_study", dtype="date").save() # ln.Feature(name="study_note", dtype="str").save() # define schema schema = ln.Schema( name="mini_immuno_obs_level_metadata_curator_tests", features=[ ln.Feature(name="perturbation", dtype=perturbation).save(), ln.Feature(name="sample_note", dtype=str).save(), ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(), ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(), ], index=ln.Feature(name="sample_label", dtype=ln.ULabel).save(), ).save() yield schema for af in ln.Artifact.filter(): af.delete(permanent=True) from lamindb.models import SchemaComponent SchemaComponent.filter().delete(permanent=True) ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) bt.Gene.filter().delete(permanent=True) ln.ULabel.filter(type__isnull=False).delete(permanent=True) ln.ULabel.filter().delete(permanent=True) bt.CellType.filter().delete(permanent=True) @pytest.fixture(scope="module") def curator_params(): """Common curator parameters.""" return { "categoricals": { "perturbation": ln.ULabel.name, "cell_type_by_expert": bt.CellType.name, "cell_type_by_model": bt.CellType.name, }, "organism": "human", } @pytest.fixture(scope="module") def mudata_papalexi21_subset_schema(): # define labels perturbation = ln.ULabel(name="Perturbation", is_type=True).save() ln.ULabel(name="Perturbed", type=perturbation).save() ln.ULabel(name="NT", type=perturbation).save() replicate = ln.ULabel(name="Replicate", is_type=True).save() ln.ULabel(name="rep1", type=replicate).save() ln.ULabel(name="rep2", type=replicate).save() ln.ULabel(name="rep3", type=replicate).save() # define obs schema obs_schema = ln.Schema( name="mudata_papalexi21_subset_obs_schema", features=[ ln.Feature(name="perturbation", dtype=perturbation).save(), ln.Feature(name="replicate", dtype=replicate).save(), ], ).save() obs_schema_rna = ln.Schema( name="mudata_papalexi21_subset_rna_obs_schema", features=[ ln.Feature(name="nCount_RNA", dtype=int).save(), ln.Feature(name="nFeature_RNA", dtype=int).save(), ln.Feature(name="percent.mito", dtype=float).save(), ], coerce=True, ).save() obs_schema_hto = ln.Schema( name="mudata_papalexi21_subset_hto_obs_schema", features=[ ln.Feature(name="nCount_HTO", dtype=int).save(), ln.Feature(name="nFeature_HTO", dtype=int).save(), ln.Feature(name="technique", dtype=bt.ExperimentalFactor).save(), ], coerce=True, ).save() var_schema_rna = ln.Schema( name="mudata_papalexi21_subset_rna_var_schema", itype=bt.Gene.symbol, dtype=float, ).save() # define composite schema mudata_schema = ln.Schema( name="mudata_papalexi21_subset_mudata_schema", otype="MuData", slots={ "obs": obs_schema, "rna:obs": obs_schema_rna, "hto:obs": obs_schema_hto, "rna:var": var_schema_rna, }, ).save() yield mudata_schema for af in ln.Artifact.filter(): af.delete(permanent=True) ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) bt.models.SchemaGene.filter().delete() bt.Gene.filter().delete(permanent=True) ln.ULabel.filter(type__isnull=False).delete(permanent=True) ln.ULabel.filter().delete(permanent=True) bt.ExperimentalFactor.filter().delete(permanent=True) @pytest.fixture(scope="module") def study_metadata_schema(): from define_schema_df_metadata import study_metadata_schema yield study_metadata_schema study_metadata_schema.delete(permanent=True) ln.Feature.filter().delete(permanent=True) @pytest.fixture(scope="module") def anndata_uns_schema(): from define_schema_anndata_uns import anndata_uns_schema yield 
anndata_uns_schema ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) @pytest.fixture(scope="module") def spatialdata_blobs_schema(): from define_schema_spatialdata import sdata_schema yield sdata_schema for af in ln.Artifact.filter(): af.delete(permanent=True) from lamindb.models import SchemaComponent SchemaComponent.filter().delete(permanent=True) ln.Schema.filter().delete(permanent=True) bt.models.SchemaGene.filter().delete() bt.Gene.filter().delete(permanent=True) ln.ULabel.filter(type__isnull=False).delete(permanent=True) ln.ULabel.filter().delete(permanent=True) bt.ExperimentalFactor.filter().delete(permanent=True) bt.DevelopmentalStage.filter().delete(permanent=True) bt.Disease.filter().delete(permanent=True) def test_dataframe_curator(mini_immuno_schema: ln.Schema): """Test DataFrame curator implementation.""" # Get the perturbation ULabel (created in mini_immuno_schema fixture) perturbation = ln.ULabel.get(name="Perturbation", is_type=True) # invalid simple dtype (float) feature_to_fail = ln.Feature(name="treatment_time_h", dtype=float).save() schema = ln.Schema( name="mini_immuno_obs_level_metadata_v2", features=[ ln.Feature(name="perturbation", dtype=perturbation).save(), ln.Feature(name="sample_note", dtype=str).save(), ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(), ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(), feature_to_fail, ], ).save() df = datasets.mini_immuno.get_dataset1(otype="DataFrame") curator = ln.curators.DataFrameCurator(df, schema) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert ( "Column 'treatment_time_h' failed series or dataframe validator 0: " in error.exconly() ) schema.delete(permanent=True) feature_to_fail.delete(permanent=True) # Wrong subtype df = datasets.mini_immuno.get_dataset1(otype="DataFrame", with_wrong_subtype=True) curator = ln.curators.DataFrameCurator(df, mini_immuno_schema) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert ( error.exconly() == """lamindb.errors.ValidationError: 1 term not validated in feature 'perturbation': 'ulabel_but_not_perturbation' → fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('perturbation') → a valid label for subtype 'Perturbation' has to be one of ['DMSO', 'IFNG']""" ) # Typo df = datasets.mini_immuno.get_dataset1(otype="DataFrame", with_typo=True) curator = ln.curators.DataFrameCurator(df, mini_immuno_schema) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert ( error.exconly() == """lamindb.errors.ValidationError: 1 term not validated in feature 'perturbation': 'IFNJ' → fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('perturbation') → a valid label for subtype 'Perturbation' has to be one of ['DMSO', 'IFNG']""" ) df = datasets.mini_immuno.get_dataset1(otype="DataFrame") curator = ln.curators.DataFrameCurator(df, mini_immuno_schema) artifact = curator.save_artifact(key="examples/dataset1.parquet") assert artifact.schema == mini_immuno_schema assert artifact.features.slots["columns"].n_members == 5 assert ( artifact.features.describe(return_str=True) == """\ Artifact: examples/dataset1.parquet (0000) └── Dataset features └── columns (5) cell_type_by_expe… bionty.CellType B cell, CD8-positive, alph… cell_type_by_model bionty.CellType B cell, T cell perturbation ULabel[Perturbation] DMSO, IFNG sample_label ULabel sample1, sample2, sample3 sample_note str""" ) assert 
set(artifact.features.get_values()["sample_label"]) == { "sample1", "sample2", "sample3", } assert set(artifact.features.get_values()["cell_type_by_expert"]) == { "CD8-positive, alpha-beta T cell", "B cell", } assert set(artifact.features.get_values()["cell_type_by_model"]) == { "T cell", "B cell", } # a second dataset with missing values ln.ULabel.from_values(["sample4", "sample5", "sample6"], create=True).save() df = ln.examples.datasets.mini_immuno.get_dataset2( otype="DataFrame", gene_symbols_in_index=True ) curator = ln.curators.DataFrameCurator(df, mini_immuno_schema) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert "column 'sample_note' not in dataframe" in error.exconly() assert "column 'cell_type_by_expert' not in dataframe" in error.exconly() curator.standardize() curator.validate() artifact.delete(permanent=True) def test_dataframe_curator_index(): """Test validating a DataFrame index.""" df = datasets.mini_immuno.get_dataset1( otype="DataFrame", with_index_type_mismatch=True ) feature = ln.Feature(name="test", dtype="str").save() schema = ln.Schema(index=feature).save() curator = ln.curators.DataFrameCurator(df, schema) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert "expected series 'None' to have type str" in error.exconly() schema.delete(permanent=True) feature.delete(permanent=True) def test_dataframe_curator_validate_all_annotate_cat(mini_immuno_schema): """Do not pass any features.""" schema = ln.Schema(itype=ln.Feature).save() assert schema.flexible df = datasets.mini_immuno.get_dataset1(otype="DataFrame") artifact = ln.Artifact.from_dataframe( df, key="examples/dataset1.parquet", schema=schema ).save() assert set(artifact.features.get_values()["perturbation"]) == { "DMSO", "IFNG", } assert set(artifact.features.get_values()["cell_type_by_expert"]) == { "CD8-positive, alpha-beta T cell", "B cell", } assert set(artifact.features.get_values()["cell_type_by_model"]) == { "T cell", "B cell", } artifact.delete(permanent=True) schema.delete(permanent=True) def test_same_name_different_type(): """The same feature names are allowed as long as they have different feature types.""" type_a = ln.Feature( name="TypeA", is_type=True, description="Type A features" ).save() type_b = ln.Feature( name="TypeB", is_type=True, description="Type B features" ).save() assay_a = ln.Feature(name="assay name", type=type_a, dtype=str).save() assay_b = ln.Feature(name="assay name", type=type_b, dtype=str).save() schema = ln.Schema( name="schema_a", features=[ln.Feature.get(name="assay name", type=type_a)], flexible=True, otype="DataFrame", ).save() df = pd.DataFrame({"assay name": ["exp1", "exp2"]}) artifact = ln.Artifact.from_dataframe(df, description="testdata").save() curator = ln.curators.DataFrameCurator(artifact, schema) curator.save_artifact() artifact.delete(permanent=True) ln.Schema.filter(features__name="assay name").delete(permanent=True) schema.delete(permanent=True) for feat in [assay_a, assay_b, type_a, type_b]: feat.delete(permanent=True) def test_dataframe_curator_validate_all_annotate_cat2(mini_immuno_schema): """Combine half-specifying features, half not.""" schema = ln.Schema( itype=ln.Feature, features=[ln.Feature.get(name="perturbation")], flexible=True, ).save() assert schema.flexible df = datasets.mini_immuno.get_dataset1(otype="DataFrame") curator = ln.curators.DataFrameCurator(df, schema) artifact = curator.save_artifact(key="examples/dataset1.parquet") assert 
set(artifact.features.get_values()["perturbation"]) == { "DMSO", "IFNG", } assert set(artifact.features.get_values()["cell_type_by_expert"]) == { "CD8-positive, alpha-beta T cell", "B cell", } assert set(artifact.features.get_values()["cell_type_by_model"]) == { "T cell", "B cell", } artifact.delete(permanent=True) schema.delete(permanent=True) @pytest.mark.parametrize("include_attrs_slot", [True, False]) def test_dataframe_attrs_validation(study_metadata_schema, include_attrs_slot): df = datasets.mini_immuno.get_dataset1(otype="DataFrame") perturbation = ln.ULabel(name="Perturbation", is_type=True).save() perturbation_feature = ln.Feature(name="perturbation", dtype=perturbation).save() ln.ULabel(name="DMSO", type=perturbation).save() ln.ULabel(name="IFNG", type=perturbation).save() if include_attrs_slot: schema = ln.Schema( features=[perturbation_feature], slots={"attrs": study_metadata_schema}, otype="DataFrame", ).save() else: schema = ln.Schema( features=[perturbation_feature], otype="DataFrame", ).save() bad_schema = ln.Schema( features=[perturbation_feature], slots={"doesnotexist": schema}, otype="DataFrame", ).save() with pytest.raises(ValueError) as e: curator = ln.curators.DataFrameCurator(df, schema=bad_schema) assert ( "Slot 'doesnotexist' is not supported for DataFrameCurator. Must be 'attrs'." in str(e.value) ) curator = ln.curators.DataFrameCurator(df, schema=schema) if include_attrs_slot: assert curator.slots["attrs"].__class__.__name__ == "ComponentCurator" else: assert not curator.slots curator.validate() artifact = curator.save_artifact(key="examples/df_with_attrs.parquet") assert artifact.schema == schema if include_attrs_slot: assert "attrs" in artifact.features.slots assert artifact.features.slots["attrs"].features.first() == ln.Feature.get( name="temperature" ) assert artifact.features.slots["attrs"].features.last() == ln.Feature.get( name="experiment" ) else: assert ( not hasattr(artifact.features, "slots") or "attrs" not in artifact.features.slots ) from lamindb.models import SchemaComponent SchemaComponent.filter().delete(permanent=True) artifact.delete(permanent=True) bad_schema.delete(permanent=True) schema.delete(permanent=True) def test_schema_new_genes(ccaplog): df = pd.DataFrame( index=pd.Index( [ "ENSG00000139618", # BRCA2 "ENSG00000141510", # TP53 "ENSG00999000001", # Invalid ID "ENSG00999000002", # Invalid ID ], name="ensembl", ) ) feature = ln.Feature(name="ensembl", dtype=bt.Gene.ensembl_gene_id).save() schema = ln.Schema(index=feature).save() curator = ln.curators.DataFrameCurator(df, schema) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert error.exconly().startswith( "lamindb.errors.ValidationError: 2 terms not validated in feature 'index': 'ENSG00999000001', 'ENSG00999000002'" ) assert ( "2 terms not validated in feature 'index': 'ENSG00999000001', 'ENSG00999000002'" in ccaplog.text ) schema.delete(permanent=True) feature.delete(permanent=True) def test_schema_no_match_ensembl(): df = pd.DataFrame( index=pd.Index( [ "ENSG99999999998", # Invalid ID "ENSG99999999999", # Invalid ID ], name="ensembl", ) ) schema = ln.Schema( index=ln.Feature(name="ensembl", dtype=bt.Gene.ensembl_gene_id).save() ).save() curator = ln.curators.DataFrameCurator(df, schema) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert ( error.exconly() == """lamindb.errors.ValidationError: 2 terms not validated in feature 'index': 'ENSG99999999998', 'ENSG99999999999' → fix typos, remove non-existent values, or save terms 
via: curator.cat.add_new_from('index')"""
    )
    schema.delete(permanent=True)


def test_schema_mixed_ensembl_symbols(ccaplog):
    """Quite some datasets have mixed Ensembl gene IDs and gene symbols.

    Because LaminDB currently does not support validating values against a
    union of fields, the expected behavior is that an error is raised when
    such a dataset is encountered: the symbol entries fail validation against
    the Ensembl gene ID field, as asserted below.
    """
    df = pd.DataFrame(
        index=pd.Index(
            [
                "ENSG00000139618",
                "ENSG00000141510",
                "BRCA2",  # symbol
                "TP53",  # symbol
            ],
            name="ensembl",
        )
    )
    schema = ln.Schema(
        index=ln.Feature(name="ensembl", dtype=bt.Gene.ensembl_gene_id).save()
    ).save()
    curator = ln.curators.DataFrameCurator(df, schema)
    with pytest.raises(ln.errors.ValidationError) as error:
        curator.validate()
    assert error.exconly().startswith(
        "lamindb.errors.ValidationError: 2 terms not validated in feature 'index': 'BRCA2', 'TP53'"
    )
    assert "2 terms not validated in feature 'index': 'BRCA2', 'TP53'" in ccaplog.text
    schema.delete(permanent=True)


def test_schema_mixed_features(ccaplog):
    """Test that union dtype features validate against multiple registries."""
    mixed_feature = ln.Feature(
        name="mixed_feature",
        dtype="cat[bionty.Tissue.ontology_id|bionty.CellType.ontology_id]",
    ).save()
    df_mixed = pd.DataFrame({"mixed_feature": ["UBERON:0000178", "CL:0000540"]})
    mixed_schema = ln.Schema(features=[mixed_feature], coerce=True).save()
    mixed_curator = ln.curators.DataFrameCurator(df_mixed, mixed_schema)
    mixed_curator.validate()
    assert mixed_curator._is_validated
    assert bt.CellType.filter(ontology_id="CL:0000540").exists()
    assert bt.Tissue.filter(ontology_id="UBERON:0000178").exists()
    df_invalid = pd.DataFrame({"mixed_feature": ["INVALID:0000000"]})
    invalid_curator = ln.curators.DataFrameCurator(df_invalid, mixed_schema)
    with pytest.raises(ln.errors.ValidationError):
        invalid_curator.validate()
    mixed_schema.delete(permanent=True)
    mixed_feature.delete(permanent=True)


def test_anndata_curator_different_components(mini_immuno_schema: ln.Schema):
    obs_schema = mini_immuno_schema
    for add_comp in ["var.T", "obs", "uns"]:
        var_schema = ln.Schema(
            name="scRNA_seq_var_schema",
            itype=bt.Gene.ensembl_gene_id,
            dtype="num",
        ).save()
        # always assume var
        components = {"var.T": var_schema}
        if add_comp == "obs":
            components["obs"] = obs_schema
        if add_comp == "uns":
            uns_schema = ln.Schema(
                name="flexible_uns_schema",
                itype=ln.Feature,
            ).save()
            components["uns"] = uns_schema
        anndata_schema = ln.Schema(
            name="mini_immuno_anndata_schema",
            otype="AnnData",
            slots=components,
        ).save()
        assert mini_immuno_schema.id is not None, mini_immuno_schema
        assert anndata_schema.slots["var.T"] == var_schema
        if add_comp == "obs":
            assert anndata_schema.slots["obs"] == obs_schema
        if add_comp == "uns":
            assert anndata_schema.slots["uns"] == uns_schema
        describe_output = anndata_schema.describe(return_str=True)
        assert "mini_immuno_anndata_schema" in describe_output
        assert "scRNA_seq_var_schema" in describe_output
        if add_comp == "obs":
            assert "mini_immuno_anndata_schema" in describe_output
        if add_comp == "uns":
            assert "flexible_uns_schema" in describe_output
        adata = datasets.mini_immuno.get_dataset1(otype="AnnData")
        curator = ln.curators.AnnDataCurator(adata, anndata_schema)
        assert curator.slots["var.T"].__class__.__name__ == "ComponentCurator"
        if add_comp == "obs":
            assert curator.slots["obs"].__class__.__name__ == "ComponentCurator"
        if add_comp == "uns":
            assert curator.slots["uns"].__class__.__name__ == "ComponentCurator"
        artifact = ln.Artifact.from_anndata(
            adata,
key="examples/dataset1.h5ad", schema=anndata_schema ) assert artifact._curator._is_validated # important test, do not remove artifact.save() assert not hasattr(artifact, "_curator") # test that curator is deleted assert artifact.schema == anndata_schema assert artifact.features.slots["var.T"].n_members == 3 # 3 genes get linked if add_comp == "obs": assert artifact.features.slots["obs"] == obs_schema assert set(artifact.features.get_values()["cell_type_by_expert"]) == { "CD8-positive, alpha-beta T cell", "B cell", } assert set(artifact.features.get_values()["cell_type_by_model"]) == { "T cell", "B cell", } if add_comp == "uns": assert artifact.features.slots["uns"].features.first() == ln.Feature.get( name="temperature" ) artifact.delete(permanent=True) anndata_schema.delete(permanent=True) var_schema.delete(permanent=True) def test_anndata_curator_varT_curation(): ln.Schema.filter(itype="bionty.Gene.ensembl_gene_id").delete() varT_schema = ln.Schema(itype=bt.Gene.ensembl_gene_id, maximal_set=True).save() slot = "var.T" components = {slot: varT_schema} anndata_schema = ln.Schema( otype="AnnData", slots=components, ).save() for with_gene_typo in [True, False]: adata = datasets.mini_immuno.get_dataset1( otype="AnnData", with_gene_typo=with_gene_typo ) if with_gene_typo: with pytest.raises(ValidationError) as error: artifact = ln.Artifact.from_anndata( adata, key="examples/dataset1.h5ad", schema=anndata_schema ).save() assert error.exconly() == ( f"lamindb.errors.ValidationError: 1 term not validated in feature 'columns' in slot '{slot}': 'GeneTypo'\n" f" → fix typos, remove non-existent values, or save terms via: curator.slots['{slot}'].cat.add_new_from('columns')" ) else: for n_max_records in [2, 4]: ln.settings.annotation.n_max_records = n_max_records artifact = ln.Artifact.from_anndata( adata, key="examples/dataset1.h5ad", schema=anndata_schema ).save() assert ( artifact.features.slots[slot].n_members == 3 ) # 3 genes get linked assert ( artifact.features.slots[slot].itype == "bionty.Gene.ensembl_gene_id" ) if n_max_records == 2: assert not artifact.features.slots[slot].members.exists() else: assert set( artifact.features.slots[slot] .members.to_dataframe()["ensembl_gene_id"] .tolist() ) == { "ENSG00000153563", "ENSG00000010610", "ENSG00000170458", } artifact.delete(permanent=True) anndata_schema.delete(permanent=True) varT_schema.delete(permanent=True) def test_anndata_curator_varT_curation_legacy(ccaplog): varT_schema = ln.Schema(itype=bt.Gene.ensembl_gene_id, maximal_set=True).save() slot = "var" components = {slot: varT_schema} anndata_schema = ln.Schema( otype="AnnData", slots=components, ).save() for with_gene_typo in [True, False]: adata = datasets.mini_immuno.get_dataset1( otype="AnnData", with_gene_typo=with_gene_typo ) if with_gene_typo: with pytest.raises(ValidationError) as error: artifact = ln.Artifact.from_anndata( adata, key="examples/dataset1.h5ad", schema=anndata_schema ).save() assert error.exconly() == ( f"lamindb.errors.ValidationError: 1 term not validated in feature 'var_index' in slot '{slot}': 'GeneTypo'\n" f" → fix typos, remove non-existent values, or save terms via: curator.slots['{slot}'].cat.add_new_from('var_index')" ) else: artifact = ln.Artifact.from_anndata( adata, key="examples/dataset1.h5ad", schema=anndata_schema ).save() assert ( "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}" in ccaplog.text ) assert 
artifact.features.slots[slot].n_members == 3 # 3 genes get linked assert set( artifact.features.slots[slot].members.to_dataframe()["ensembl_gene_id"] ) == { "ENSG00000153563", "ENSG00000010610", "ENSG00000170458", } artifact.delete(permanent=True) anndata_schema.delete(permanent=True) varT_schema.delete(permanent=True) def test_anndata_curator_nested_uns(study_metadata_schema, anndata_uns_schema): """Test AnnDataCurator with nested uns slot validation.""" adata = datasets.mini_immuno.get_dataset1(otype="AnnData") adata.uns["study_metadata"] = adata.uns.copy() curator = ln.curators.AnnDataCurator(adata, anndata_uns_schema) assert curator.slots["uns:study_metadata"].__class__.__name__ == "ComponentCurator" curator.validate() artifact = curator.save_artifact(key="examples/anndata_with_uns.h5ad") assert artifact.schema == anndata_uns_schema assert "uns:study_metadata" in artifact.features.slots assert artifact.features.slots[ "uns:study_metadata" ].features.first() == ln.Feature.get(name="temperature") adata = datasets.mini_immuno.get_dataset1(otype="AnnData") bad_schema1 = ln.Schema( otype="AnnData", slots={"uns:nonexistent": study_metadata_schema}, ).save() with pytest.raises(InvalidArgument) as e: ln.curators.AnnDataCurator(adata, bad_schema1) assert ( "Schema slot 'uns:nonexistent' requires keys uns['nonexistent'] but key 'nonexistent' not found." in str(e.value) ) with pytest.raises(InvalidArgument) as e: bad_schema2 = ln.Schema( otype="AnnData", slots={"uns:temperature:nonexistent_nested": study_metadata_schema}, ).save() ln.curators.AnnDataCurator(adata, bad_schema2) assert ( "Schema slot 'uns:temperature:nonexistent_nested' requires keys uns['temperature']['nonexistent_nested'] but key 'nonexistent_nested' not found. Available keys at this level: none (not a dict)." 
in str(e.value) ) inferred_sets = artifact.schemas.all() for inferred_set in inferred_sets: artifact.schemas.remove(inferred_set) artifact.delete(permanent=True) bad_schema1.delete(permanent=True) bad_schema2.delete(permanent=True) anndata_uns_schema.delete(permanent=True) def test_anndata_curator_no_var(mini_immuno_schema: ln.Schema): assert mini_immuno_schema.id is not None, mini_immuno_schema # test no var schema anndata_schema_no_var = ln.Schema( name="mini_immuno_anndata_schema_no_var", otype="AnnData", slots={"obs": mini_immuno_schema}, ).save() assert mini_immuno_schema.id is not None, mini_immuno_schema adata = datasets.mini_immuno.get_dataset1(otype="AnnData") curator = ln.curators.AnnDataCurator(adata, anndata_schema_no_var) artifact = curator.save_artifact(key="examples/dataset1_no_var.h5ad") artifact.delete(permanent=True) anndata_schema_no_var.delete(permanent=True) def test_mudata_curator( mudata_papalexi21_subset_schema: ln.Schema, mini_immuno_schema: ln.Schema ): mudata_schema = mudata_papalexi21_subset_schema mdata = ln.examples.datasets.mudata_papalexi21_subset() # wrong dataset with pytest.raises(InvalidArgument): ln.curators.MuDataCurator(pd.DataFrame(), mudata_schema) # wrong schema with pytest.raises(InvalidArgument): ln.curators.MuDataCurator(mdata, mini_immuno_schema) try: # TODO: allow set cat_filters for a Schema with itype bt.settings.organism = "human" curator = ln.curators.MuDataCurator(mdata, mudata_schema) assert curator.slots.keys() == { "obs", "rna:obs", "hto:obs", "rna:var", } curator.validate() curator.slots["rna:var"].cat.standardize("columns") curator.slots["rna:var"].cat.add_new_from("columns") artifact = curator.save_artifact(key="mudata_papalexi21_subset.h5mu") assert artifact.schema == mudata_schema assert set(artifact.features.slots.keys()) == { "obs", "rna:var", "rna:obs", "hto:obs", } artifact.delete(permanent=True) mudata_schema.delete(permanent=True) mini_immuno_schema.delete(permanent=True) Path("papalexi21_subset.h5mu").unlink(missing_ok=True) finally: bt.settings.organism = None def test_mudata_curator_nested_uns(study_metadata_schema): """Test MuData with nested uns slot validation. This test verifies the behavior of both the MuData `.uns` slots and a `.uns` slot of an AnnData object inside the MuData object that gets specified using the key `:` syntax. 
""" mdata = ln.examples.datasets.mudata_papalexi21_subset(with_uns=True) site_uns_schema = ln.Schema( features=[ ln.Feature(name="pos", dtype=float).save(), ln.Feature(name="site_id", dtype=str).save(), ] ).save() mdata_schema = ln.Schema( otype="MuData", slots={ "uns:study_metadata": study_metadata_schema, "rna:uns:site_metadata": site_uns_schema, }, ).save() curator = ln.curators.MuDataCurator(mdata, mdata_schema) assert curator.slots["uns:study_metadata"].__class__.__name__ == "ComponentCurator" assert ( curator.slots["rna:uns:site_metadata"].__class__.__name__ == "ComponentCurator" ) curator.validate() artifact = curator.save_artifact(key="examples/mdata_with_uns.h5mu") assert artifact.schema == mdata_schema assert "uns:study_metadata" in artifact.features.slots assert "rna:uns:site_metadata" in artifact.features.slots assert artifact.features.slots[ "uns:study_metadata" ].features.first() == ln.Feature.get(name="temperature") assert artifact.features.slots[ "rna:uns:site_metadata" ].features.first() == ln.Feature.get(name="pos") # Clean up artifact.delete(permanent=True) Path("papalexi21_subset.h5mu").unlink(missing_ok=True) def test_spatialdata_curator( spatialdata_blobs_schema: ln.Schema, ): spatialdata = ln.examples.datasets.spatialdata_blobs() # wrong dataset with pytest.raises(InvalidArgument): ln.curators.SpatialDataCurator(pd.DataFrame(), spatialdata_blobs_schema) # wrong schema - use an actual slot that exists with pytest.raises(InvalidArgument): ln.curators.SpatialDataCurator( spatialdata, spatialdata_blobs_schema.slots["attrs:bio"] ) curator = ln.curators.SpatialDataCurator(spatialdata, spatialdata_blobs_schema) with pytest.raises(ln.errors.ValidationError): curator.validate() spatialdata.tables["table"].var.drop(index="ENSG00000999999", inplace=True) artifact = ln.Artifact.from_spatialdata( spatialdata, key="examples/spatialdata1.zarr", schema=spatialdata_blobs_schema, ).save() assert artifact.schema == spatialdata_blobs_schema assert artifact.features.slots.keys() == { "attrs:bio", "attrs:tech", "attrs", "tables:table:obs", "tables:table:var.T", } assert artifact.features.get_values()["disease"] == "Alzheimer disease" assert ( artifact.features.describe(return_str=True) == """Artifact: examples/spatialdata1.zarr (0000) └── Dataset features ├── attrs:bio (2) │ developmental_sta… bionty.DevelopmentalSt… adult stage │ disease bionty.Disease Alzheimer disease ├── attrs:tech (1) │ assay bionty.ExperimentalFac… Visium Spatial Gene Expres… ├── attrs (2) │ bio dict │ tech dict ├── tables:table:obs … │ sample_region str └── tables:table:var.… BRAF num BRCA2 num""" ) artifact.delete(permanent=True) def test_specific_source(): """Test validation of ontology terms using cat_filters to specify organism-specific source.""" obs_schema = ln.Schema( features=[ ln.Feature( name="developmental_stage_ontology_id", dtype=bt.DevelopmentalStage.ontology_id, cat_filters={ "source": bt.Source.filter( entity="bionty.DevelopmentalStage", organism="mouse" ).one() }, ).save() ], coerce=True, minimal_set=False, ).save() schema = ln.Schema( slots={"obs": obs_schema}, otype="AnnData", minimal_set=True, coerce=True ).save() adata = ad.AnnData( obs=pd.DataFrame( { "developmental_stage_ontology_id": [ "MmusDv:0000142", "MmusDv:0000022", ] } ), var=pd.DataFrame(index=["ENSMUSG00000022391", "ENSMUSG00000018569"]), ) curator = ln.curators.AnnDataCurator(adata, schema) curator.validate() schema.delete(permanent=True) ================================================ FILE: tests/curators/test_curators_remote.py 
================================================
import lamindb as ln


def test_curator_remote():
    lamindata_artifacts = ln.Artifact.connect("laminlabs/lamindata")
    curator = ln.curators.DataFrameCurator(
        lamindata_artifacts.get("Ywz5JiVNHOWSJDiK"),
        schema=ln.examples.schemas.valid_features(),
    )
    curator.validate()


================================================
FILE: tests/curators/test_dataframe_curation.py
================================================
"""Test suite for accounting on bank transactions."""

import datetime

import lamindb as ln
import pandas as pd
import pytest


@pytest.fixture(scope="module")
def transactions_schema():
    # Labels
    currency_type = ln.ULabel(name="Currency", is_type=True).save()
    usd = ln.ULabel(name="USD", type=currency_type).save()
    eur = ln.ULabel(name="EUR", type=currency_type).save()
    assert usd.type == currency_type
    assert eur.type == currency_type
    # Features
    currency = ln.Feature(name="currency_name", dtype="cat[ULabel[Currency]]").save()
    date = ln.Feature(name="date", dtype="date").save()
    receipt_url = ln.Feature(name="receipt_url", dtype="url").save()
    transaction_type = ln.Feature(name="Transaction", is_type=True).save()
    amount_usd = ln.Feature(
        name="transaction_amount_usd_cent", dtype=int, type=transaction_type
    ).save()
    amount_eur = ln.Feature(
        name="transaction_amount_eur_cent", dtype=int, type=transaction_type
    ).save()
    # Schema
    schema = ln.Schema(
        name="transaction_dataframe",
        otype="DataFrame",
        features=[
            date,
            amount_usd,
            amount_eur,
            currency,
            receipt_url,
        ],
        coerce=True,
    ).save()

    yield schema

    ln.Schema.filter(
        features__name__in=[
            "transaction_amount_eur_cent",
            "transaction_amount_usd_cent",
        ]
    ).delete(permanent=True)
    schema.delete(permanent=True)
    amount_eur.delete(permanent=True)
    amount_usd.delete(permanent=True)
    transaction_type.delete(permanent=True)
    date.delete(permanent=True)
    receipt_url.delete(permanent=True)
    currency.delete(permanent=True)
    eur.delete(permanent=True)
    usd.delete(permanent=True)
    currency_type.delete(permanent=True)


@pytest.fixture
def transactions_dataframe():
    # Create sample data
    data = {
        "date": [
            datetime.date(2024, 1, 1),
            datetime.date(2024, 1, 2),
            datetime.date(2024, 1, 3),
            datetime.date(2024, 1, 4),
            datetime.date(2024, 1, 5),
        ],
        "transaction_amount_usd_cent": [1000, 2000, 3000, 4000, 5000],
        "transaction_amount_eur_cent": [850, 1700, 2550, 3400, 4250],
        "currency_name": ["USD", "EUR", "USD", "EUR", "USD"],
        "receipt_url": [
            "https://bank.example/tx/1",
            "https://bank.example/tx/2",
            "https://bank.example/tx/3",
            "https://bank.example/tx/4",
            "https://bank.example/tx/5",
        ],
    }
    return pd.DataFrame(data)


def test_schema_creation(transactions_schema):
    """Test if schema was created properly"""
    schema = ln.Schema.get(name="transaction_dataframe")
    assert schema is not None
    assert schema.otype == "DataFrame"
    # check the order of the features
    assert schema.members.to_list("name") == [
        "date",
        "transaction_amount_usd_cent",
        "transaction_amount_eur_cent",
        "currency_name",
        "receipt_url",
    ]


def test_data_curation(
    transactions_schema: ln.Schema, transactions_dataframe: pd.DataFrame
):
    """Test if data curation works properly"""
    curator = ln.curators.DataFrameCurator(transactions_dataframe, transactions_schema)
    assert curator.validate() is None
    # URLs are currently validated as string values.
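    # A hedged illustration (commented out, not executed by this test) of what the
    # comment above implies: since the "url" dtype is currently validated as plain
    # strings, presumably any str value in `receipt_url` would pass, while non-str
    # values fail. `sketch_df` is a hypothetical variable, not part of the original test:
    #
    #   sketch_df = transactions_dataframe.copy()
    #   sketch_df["receipt_url"] = "not-a-real-url"  # a str, so it would still validate
    #   ln.curators.DataFrameCurator(sketch_df, transactions_schema).validate()
    #
    # The non-str case is exercised in test_invalid_url_dtype below.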
assert transactions_dataframe["receipt_url"].iloc[0] == "https://bank.example/tx/1" artifact = curator.save_artifact(key="test_transaction_dataset.csv") assert artifact.suffix == ".csv" artifact.delete(permanent=True) def test_missing_required_feature(transactions_schema: ln.Schema): """Test if validation fails for invalid data""" data_missing_required_feature = { "date": [datetime.date(2024, 1, 1)], "transaction_amount_usd_cent": [1000], "currency_name": ["USD"], "receipt_url": ["https://bank.example/tx/1"], } invalid_df = pd.DataFrame(data_missing_required_feature) schema = ln.Schema.get(name="transaction_dataframe") curator = ln.curators.DataFrameCurator(invalid_df, schema) with pytest.raises(ln.errors.ValidationError) as err: curator.validate() message = "column 'transaction_amount_eur_cent' not in dataframe. Columns in dataframe: ['date', 'transaction_amount_usd_cent', 'currency_name']" assert message in str(err) def test_invalid_label(transactions_schema: ln.Schema): """Test if validation fails for invalid currency""" # Create dataframe with invalid currency invalid_data = { "date": [datetime.date(2024, 1, 1)], "transaction_amount_usd_cent": [1000], "transaction_amount_eur_cent": [850], "currency_name": ["GBP"], # Invalid currency not in our labels "receipt_url": ["https://bank.example/tx/1"], } invalid_df = pd.DataFrame(invalid_data) schema = ln.Schema.get(name="transaction_dataframe") curator = ln.curators.DataFrameCurator(invalid_df, schema) with pytest.raises(ln.errors.ValidationError): curator.validate() # exconly = """lamindb.errors.ValidationError: 1 term is not validated: 'GBP' # → fix typos, remove non-existent values, or save terms via .add_new_from("currency_name")""" # assert err.exconly() == exconly def test_invalid_url_dtype(transactions_schema: ln.Schema): """Test if validation fails for non-string URL values.""" invalid_data = { "date": [datetime.date(2024, 1, 1)], "transaction_amount_usd_cent": [1000], "transaction_amount_eur_cent": [850], "currency_name": ["USD"], "receipt_url": [123], # URL is currently validated as string dtype } invalid_df = pd.DataFrame(invalid_data) schema = ln.Schema.get(name="transaction_dataframe") curator = ln.curators.DataFrameCurator(invalid_df, schema) with pytest.raises(ln.errors.ValidationError) as err: curator.validate() assert "receipt_url" in str(err.value) ================================================ FILE: tests/integrations/conftest.py ================================================ import shutil from time import perf_counter import lamindb_setup as ln_setup import pytest def pytest_sessionstart(): t_execute_start = perf_counter() ln_setup.init(storage="./testdb-integrations") total_time_elapsed = perf_counter() - t_execute_start print(f"time to setup the instance: {total_time_elapsed:.1f}s") def pytest_sessionfinish(session: pytest.Session): shutil.rmtree("./testdb-integrations") ln_setup.delete("testdb-integrations", force=True) @pytest.fixture def ccaplog(caplog): """Add caplog handler to our custom logger at session start.""" from lamin_utils._logger import logger logger.addHandler(caplog.handler) yield caplog logger.removeHandler(caplog.handler) ================================================ FILE: tests/integrations/test_lightning.py ================================================ import json import shutil from pathlib import Path from typing import Any, Generator, cast from unittest.mock import MagicMock import lamindb as ln import lightning as pl import pytest import torch from django.db import connection from 
django.test.utils import CaptureQueriesContext from lamindb.integrations import lightning as ll from lamindb.models._feature_manager import FeatureManager from torch import nn from torch.utils.data import DataLoader, TensorDataset @pytest.fixture(autouse=True) def cleanup_checkpoints() -> Generator[None, None, None]: """Clean up checkpoint files and directories after each test.""" yield checkpoints_dir = Path("checkpoints") if checkpoints_dir.exists(): shutil.rmtree(checkpoints_dir) @pytest.fixture(autouse=True, scope="session") def cleanup_test_dir() -> Generator[None, None, None]: """Clean up test directory after all tests.""" yield for dirname in ("lightning_checkpoints", "test_lightning", "lightning_logs"): dirpath = Path(dirname) if dirpath.exists(): shutil.rmtree(dirpath) @pytest.fixture def simple_model() -> pl.LightningModule: class SimpleModel(pl.LightningModule): def __init__(self): super().__init__() self.layer = nn.Linear(10, 1) def forward(self, x): return self.layer(x) def training_step(self, batch, batch_idx): x, y = batch loss = nn.functional.mse_loss(self(x), y) self.log("train_loss", loss) return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters()) return SimpleModel() @pytest.fixture def dataloader() -> DataLoader: return DataLoader( TensorDataset(torch.randn(100, 10), torch.randn(100, 1)), batch_size=10 ) @pytest.fixture def dirpath(request: pytest.FixtureRequest) -> Generator[str, None, None]: prefix = f"lightning_checkpoints/{request.node.name}/" yield prefix for af in ln.Artifact.filter(key__startswith=prefix): af.delete(permanent=True, storage=True) dirpath_path = Path(prefix) if dirpath_path.exists(): shutil.rmtree(dirpath_path) @pytest.fixture(scope="session") def lightning_features() -> Generator[None, None, None]: """Create lightning features.""" ll.save_lightning_features() yield if lightning_type := ln.Feature.filter(name="lamindb.lightning").one_or_none(): for feat in ln.Feature.filter(type=lightning_type): for af in ln.Artifact.filter(schemas__features=feat): af.delete(permanent=True, storage=True) # JSONValues are lingering and also need to be deleted ln.models.RunJsonValue.filter(jsonvalue__feature=feat).delete( permanent=True ) ln.models.JsonValue.filter(feature=feat).delete(permanent=True) feat.delete(permanent=True) def test_checkpoint_basic( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, ): """Checkpoint should create artifacts with semantic paths.""" callback = ll.Checkpoint(dirpath=dirpath, monitor="train_loss") trainer = pl.Trainer( max_epochs=2, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = ln.Artifact.filter(key__startswith=prefix) assert len(artifacts) >= 1 for af in artifacts: assert af.kind == "model" assert af.key.startswith(prefix) def test_checkpoint_with_features( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, ): """Checkpoint should annotate artifacts with feature values.""" ln.Feature(name="train_loss", dtype=float).save() ln.Feature(name="custom_param", dtype=str).save() ln.track() callback = ll.Checkpoint( dirpath=dirpath, features={ "artifact": {"train_loss": None}, "run": {"custom_param": "test_value"}, }, monitor="train_loss", ) trainer = pl.Trainer( max_epochs=2, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = ln.Artifact.filter(key__startswith=prefix) assert len(artifacts) >= 1 for af 
in artifacts: values = af.features.get_values() assert "train_loss" in values assert ln.context.run.features.get_values()["custom_param"] == "test_value" ln.finish() def test_checkpoint_missing_features( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, ): """Checkpoint should raise an error when specified features do not exist.""" callback = ll.Checkpoint( dirpath=dirpath, features={"artifact": {"nonexistent_feature": None}}, monitor="train_loss", ) trainer = pl.Trainer( max_epochs=1, callbacks=[callback], logger=False, ) with pytest.raises(ValueError, match="Feature nonexistent_feature missing"): trainer.fit(simple_model, dataloader) def test_checkpoint_auto_features( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """Checkpoint should auto-track lightning features if they exist.""" callback = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_top_k=2, ) trainer = pl.Trainer( max_epochs=3, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = ln.Artifact.filter(key__startswith=prefix) assert len(artifacts) >= 1 for af in artifacts: values = af.features.get_values() assert "is_best_model" in values assert "is_last_model" in values assert "score" in values assert "model_rank" in values def test_checkpoint_auto_features_with_duplicate_score_name( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """Auto-tracking should work if a generic 'score' feature also exists.""" ln.Feature(name="score", dtype=float).save() callback = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_top_k=2, ) trainer = pl.Trainer( max_epochs=1, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = ln.Artifact.filter(key__startswith=prefix) assert len(artifacts) >= 1 def test_checkpoint_best_model_with_duplicate_feature_names( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """Clearing best-model flags should work when duplicate feature names exist. Regression test: when a Feature named 'is_best_model' exists both under the lamindb.lightning type and without a type (or under a different type), remove_values used to call Feature.get(name=...) which raised MultipleObjectsReturned. The fix uses type-scoped Feature lookups. 
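
    A minimal sketch of the idea (not necessarily the exact implementation),
    using registry calls that appear elsewhere in this suite::

        lightning_type = ln.Feature.filter(name="lamindb.lightning").one()
        is_best = ln.Feature.filter(name="is_best_model", type=lightning_type).one()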
""" # Create a duplicate 'is_best_model' feature without the lightning type ln.Feature(name="is_best_model", dtype=bool).save() callback = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_top_k=2, mode="min", ) trainer = pl.Trainer( max_epochs=3, callbacks=[callback], logger=False, ) # This would raise MultipleObjectsReturned before the fix trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = ln.Artifact.filter(key__startswith=prefix) assert len(artifacts) >= 1 best_count = sum( 1 for af in artifacts if af.features.get_values().get("is_best_model") is True ) assert best_count == 1 last_count = sum( 1 for af in artifacts if af.features.get_values().get("is_last_model") is True ) assert last_count == 1 def test_checkpoint_query_budget_scales_sublinearly_with_hparams( dataloader: DataLoader, dirpath: str, lightning_features: None ): """DB queries should not scale linearly with hparam count.""" class ModelWithManyHparams(pl.LightningModule): def __init__(self, n_hparams: int): super().__init__() self.layer = nn.Linear(10, 1) self.save_hyperparameters({f"hp_{i}": i for i in range(n_hparams)}) def forward(self, x): return self.layer(x) def training_step(self, batch, batch_idx): x, y = batch loss = nn.functional.mse_loss(self(x), y) self.log("train_loss", loss) return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters()) def count_fit_queries(n_hparams: int) -> int: model = ModelWithManyHparams(n_hparams) callback = ll.Checkpoint( dirpath=f"{dirpath.rstrip('/')}/{n_hparams}/", monitor="train_loss" ) trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=False) with CaptureQueriesContext(connection) as ctx: trainer.fit(model, dataloader) return len(ctx.captured_queries) low_hparams_queries = count_fit_queries(2) high_hparams_queries = count_fit_queries(40) assert high_hparams_queries <= low_hparams_queries + 10 def test_model_rank_update_query_budget( dirpath: str, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, lightning_features: None, ): """Ranking should use batched feature reads.""" callback = ll.Checkpoint(dirpath=dirpath, monitor="train_loss", mode="min") # Provide a stub trainer so checkpoint_key_prefix can compute on-the-fly. # Only _original_dirpath matters for key derivation here. 
stub_trainer = MagicMock(spec=pl.Trainer) stub_trainer.loggers = [] callback._trainer = stub_trainer key_prefix = callback.checkpoint_key_prefix created_artifacts = [] for i in range(8): model_file = tmp_path / f"model_{i}.ckpt" model_file.write_bytes(f"checkpoint-{i}".encode()) artifact = ln.Artifact( model_file, key=f"{key_prefix}/model_{i}.ckpt", kind="model" ) artifact.save() artifact.features.add_values({"score": float(i), "model_rank": i}) created_artifacts.append(artifact) monkeypatch.setattr(FeatureManager, "remove_values", lambda *args, **kwargs: None) monkeypatch.setattr(FeatureManager, "add_values", lambda *args, **kwargs: None) with CaptureQueriesContext(connection) as ctx: callback._feature_annotator.update_model_ranks(key_prefix, mode="min") assert len(ctx.captured_queries) <= 6 for artifact in created_artifacts: artifact.delete(permanent=True, storage=True) def test_checkpoint_best_model_tracking( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """Only one checkpoint should be marked as best model.""" callback = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_top_k=3, mode="min", ) trainer = pl.Trainer( max_epochs=3, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = ln.Artifact.filter(key__startswith=prefix) best_count = sum( 1 for af in artifacts if af.features.get_values().get("is_best_model") is True ) assert best_count == 1 def test_checkpoint_model_rank( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """Checkpoints should have correct model_rank (0 = best).""" callback = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_top_k=3, mode="min", ) trainer = pl.Trainer( max_epochs=3, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = ln.Artifact.filter(key__startswith=prefix) ranks = [af.features.get_values().get("model_rank") for af in artifacts] assert 0 in ranks # best model has rank 0 last_count = sum( 1 for af in artifacts if af.features.get_values().get("is_last_model") is True ) assert last_count == 1 def test_checkpoint_last_model_points_to_last_saved_artifact( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """The artifact flagged as last model should be the last saved checkpoint artifact.""" checkpoint = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_top_k=3, mode="min", ) trainer = pl.Trainer( max_epochs=3, callbacks=[checkpoint], logger=False, ) trainer.fit(simple_model, dataloader) prefix = checkpoint.checkpoint_key_prefix + "/" artifacts = list(ln.Artifact.filter(key__startswith=prefix)) last_artifacts = [ artifact for artifact in artifacts if artifact.features.get_values().get("is_last_model") is True ] assert len(last_artifacts) == 1 assert checkpoint.last_checkpoint_artifact is not None assert last_artifacts[0].id == checkpoint.last_checkpoint_artifact.id def test_checkpoint_semantic_paths( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """Checkpoints should have semantic keys derived from dirpath.""" callback = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_top_k=3, ) trainer = pl.Trainer( max_epochs=3, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = 
ln.Artifact.filter(key__startswith=prefix) assert len(artifacts) >= 1 for af in artifacts: assert af.key.startswith(prefix) values = af.features.get_values() assert "is_best_model" in values assert "score" in values def test_callback_deprecated( simple_model: pl.LightningModule, dataloader: DataLoader, tmp_path: Path, ): """Deprecated Callback should still work.""" key = f"test/legacy/{tmp_path.name}/model.ckpt" path = tmp_path / "model.ckpt" with pytest.warns(DeprecationWarning, match="use ll.Checkpoint instead"): callback = ll.Callback(path=path, key=key) trainer = pl.Trainer( max_epochs=1, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) artifacts = ln.Artifact.filter(key=key) assert len(artifacts) >= 1 assert artifacts[0].kind == "model" # cleanup for af in artifacts: af.delete(permanent=True, storage=True) def test_checkpoint_overwrites_existing_artifact( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, ): """Checkpoint with same key should transparently replace the existing artifact.""" dummy = tmp_path / "dummy.ckpt" dummy.write_bytes(b"dummy") fixed_key = f"{dirpath.rstrip('/')}/fixed.ckpt" ln.Artifact(dummy, key=fixed_key).save() old_uid = ln.Artifact.filter(key=fixed_key).one().uid callback = ll.Checkpoint(dirpath=dirpath) monkeypatch.setattr(callback, "resolve_artifact_key", lambda **kwargs: fixed_key) trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=False) trainer.fit(simple_model, dataloader) new_artifact = ln.Artifact.filter(key=fixed_key).one() assert new_artifact.uid != old_uid for af in ln.Artifact.filter(key=fixed_key): af.delete(permanent=True, storage=True) def test_checkpoint_invalid_feature_keys(dirpath: str): """Checkpoint should raise on invalid feature keys.""" with pytest.raises(ValueError, match="Invalid feature keys"): ll.Checkpoint( dirpath=dirpath, features={"invalid_key": {"foo": "bar"}}, # type: ignore ) def test_checkpoint_hparams(dataloader: DataLoader, dirpath: str, lightning_features): """Checkpoint should auto-capture model hparams if features exist.""" class ModelWithHparams(pl.LightningModule): def __init__(self, hidden_size: int = 32, learning_rate: float = 0.001): super().__init__() self.save_hyperparameters() self.layer = nn.Linear(10, hidden_size) self.out = nn.Linear(hidden_size, 1) def forward(self, x): return self.out(torch.relu(self.layer(x))) def training_step(self, batch, batch_idx): x, y = batch loss = nn.functional.mse_loss(self(x), y) self.log("train_loss", loss) return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) ln.Feature(name="hidden_size", dtype=int).save() ln.Feature(name="learning_rate", dtype=float).save() ln.track() model = ModelWithHparams(hidden_size=64, learning_rate=0.01) callback = ll.Checkpoint(dirpath=dirpath, monitor="train_loss") trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=False) trainer.fit(model, dataloader) run = ln.context.run run_features = run.features.get_values() assert run_features["hidden_size"] == 64 assert run_features["learning_rate"] == 0.01 ln.finish() def test_checkpoint_datamodule_hparams( simple_model: pl.LightningModule, dirpath: str, lightning_features ): """Checkpoint should auto-capture datamodule hparams if features exist.""" class DataModuleWithHparams(pl.LightningDataModule): def __init__(self, batch_size: int = 32, num_workers: int = 4): super().__init__() self.save_hyperparameters() def 
train_dataloader(self): return DataLoader( TensorDataset(torch.randn(100, 10), torch.randn(100, 1)), batch_size=self.hparams.batch_size, ) ln.Feature(name="batch_size", dtype=int).save() ln.Feature(name="num_workers", dtype=int).save() ln.track() datamodule = DataModuleWithHparams(batch_size=16, num_workers=2) callback = ll.Checkpoint(dirpath=dirpath, monitor="train_loss") trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=False) trainer.fit(simple_model, datamodule=datamodule) run = ln.context.run run_features = run.features.get_values() assert run_features["batch_size"] == 16 assert run_features["num_workers"] == 2 ln.finish() def test_checkpoint_trainer_config( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """Checkpoint should auto-capture trainer config if features exist.""" ln.track() callback = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_weights_only=True, mode="min", ) trainer = pl.Trainer( max_epochs=5, max_steps=100, precision="32", accumulate_grad_batches=2, gradient_clip_val=0.5, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) run_features = ln.context.run.features.get_values() artifacts = ln.Artifact.filter(key__startswith=callback.checkpoint_key_prefix + "/") assert run_features["max_epochs"] == 5 assert run_features["max_steps"] == 100 assert run_features["precision"] == "32-true" assert run_features["accumulate_grad_batches"] == 2 assert run_features["gradient_clip_val"] == 0.5 assert run_features["monitor"] == "train_loss" assert run_features["mode"] == "min" assert "save_weights_only" not in run_features assert len(artifacts) >= 1 for artifact in artifacts: artifact_features = artifact.features.get_values() assert artifact_features["save_weights_only"] is True assert artifact_features["monitor"] == "train_loss" assert artifact_features["mode"] == "min" ln.finish() def test_checkpoint_hparams_yaml_with_hparams( dataloader: DataLoader, dirpath: str, tmp_path: Path, ): """Checkpoint should save hparams.yaml when model has hyperparameters.""" from lightning.pytorch.loggers import CSVLogger class ModelWithHparams(pl.LightningModule): def __init__(self, hidden_size: int = 32): super().__init__() self.save_hyperparameters() self.layer = nn.Linear(10, hidden_size) self.out = nn.Linear(hidden_size, 1) def forward(self, x): return self.out(torch.relu(self.layer(x))) def training_step(self, batch, batch_idx): x, y = batch loss = nn.functional.mse_loss(self(x), y) self.log("train_loss", loss) return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters()) logger = CSVLogger(save_dir=tmp_path, name="test_logs") model = ModelWithHparams(hidden_size=64) callback = ll.Checkpoint(dirpath=dirpath, monitor="train_loss") trainer = pl.Trainer( max_epochs=1, callbacks=[callback], logger=logger, ) trainer.fit(model, dataloader) resolved_dirpath = callback.checkpoint_key_prefix hparams_key = f"{resolved_dirpath}/hparams.yaml" hparams_artifact = ln.Artifact.filter(key=hparams_key).one_or_none() assert hparams_artifact is not None assert hparams_artifact.description == "Lightning run hyperparameters" # cleanup hparams_artifact.delete(permanent=True) shutil.rmtree(tmp_path / "test_logs", ignore_errors=True) @pytest.mark.parametrize( ("use_dirpath", "use_logger"), [ (True, True), (False, True), (True, False), (False, False), ], ids=[ "dirpath-logger", "no-dirpath-logger", "dirpath-no-logger", "no-dirpath-no-logger", ], ) def test_key_layout_matrix( simple_model: 
pl.LightningModule, dataloader: DataLoader, tmp_path: Path, use_dirpath: bool, use_logger: bool, ): """Artifact keys must follow the base-prefix layout across all 4 configurations. With ``run_uid_is_version=True`` and an active Lamin run, the expected key layout is:: {base}/checkpoints/{ckpt_filename} {base}/config.yaml (when SaveConfigCallback is used) {base}/checkpoints/hparams.yaml (when model has hyperparameters) Where ``base`` is determined by: ============================== ================================== Scenario Base prefix ============================== ================================== dirpath set (± logger) ``{dirpath}/{run_uid}`` no dirpath + logger ``{save_dir_name}/{name}/{run_uid}`` no dirpath + no logger ``{run_uid}`` ============================== ================================== """ from lightning.pytorch.loggers import CSVLogger class ParserStub: def save(self, config, path, skip_none, overwrite, multifile): del skip_none, overwrite, multifile Path(path).write_text(json.dumps(config, indent=2)) dirpath = str(tmp_path / "layout_test") ln.track() run_uid = ln.context.run.uid logger: CSVLogger | bool logger_name = "layout_exp" if use_logger: logger = CSVLogger(save_dir=tmp_path, name=logger_name) else: logger = False checkpoint = ll.Checkpoint( dirpath=dirpath if use_dirpath else None, monitor="train_loss", run_uid_is_version=True, ) config = {"trainer": {"max_epochs": 1}} save_config = ll.SaveConfigCallback( parser=cast(Any, ParserStub()), config=config, config_filename="config.yaml", ) trainer = pl.Trainer( max_epochs=1, callbacks=[checkpoint, save_config], logger=logger, default_root_dir=tmp_path, ) trainer.fit(simple_model, dataloader) # Determine expected base prefix if use_dirpath: expected_base = f"{dirpath.rstrip('/')}/{run_uid}" elif use_logger: expected_base = f"{tmp_path.name}/{logger_name}/{run_uid}" else: expected_base = run_uid # Verify base_prefix assert checkpoint.base_prefix == expected_base # Verify checkpoint key prefix expected_ckpt_prefix = f"{expected_base}/checkpoints" assert checkpoint.checkpoint_key_prefix == expected_ckpt_prefix # Verify checkpoint artifacts exist under the correct prefix ckpt_artifacts = ln.Artifact.filter(key__startswith=expected_ckpt_prefix + "/") assert len(ckpt_artifacts) >= 1 for af in ckpt_artifacts: assert af.key.startswith(expected_ckpt_prefix + "/") # Verify config artifact sits directly under the base prefix expected_config_key = f"{expected_base}/config.yaml" config_artifact = ln.Artifact.filter(key=expected_config_key).one_or_none() assert config_artifact is not None, f"Expected config at {expected_config_key}" # Cleanup json_values = ln.models.JsonValue.filter(links_artifact__artifact=config_artifact) ln.models.ArtifactJsonValue.filter(artifact=config_artifact).delete() config_artifact.delete(permanent=True, storage=True) json_values.delete(permanent=True) for af in ckpt_artifacts: af.delete(permanent=True, storage=True) ln.finish() if use_logger: shutil.rmtree(tmp_path / logger_name, ignore_errors=True) def test_run_uid_not_in_key_when_disabled( simple_model: pl.LightningModule, dataloader: DataLoader, tmp_path: Path, ): """With run_uid_is_version=False, the key should use the logger version as before.""" from lightning.pytorch.loggers import CSVLogger ln.track() logger = CSVLogger(save_dir=tmp_path, name="no_uid_test") callback = ll.Checkpoint(monitor="train_loss", run_uid_is_version=False) trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=logger) trainer.fit(simple_model, dataloader) prefix = 
callback.checkpoint_key_prefix assert "version_0" in prefix assert prefix == f"{tmp_path.name}/no_uid_test/version_0/checkpoints" artifacts = ln.Artifact.filter(key__startswith=prefix + "/") assert len(artifacts) >= 1 for af in artifacts: af.delete(permanent=True, storage=True) ln.finish() shutil.rmtree(tmp_path / "no_uid_test", ignore_errors=True) def test_two_runs_same_logger_produce_different_keys( simple_model: pl.LightningModule, dataloader: DataLoader, tmp_path: Path, ): """Two tracked runs with the same logger config should not collide on keys.""" from lightning.pytorch.loggers import CSVLogger prefixes = [] for _ in range(2): ln.track() logger = CSVLogger(save_dir=tmp_path, name="collision_test") callback = ll.Checkpoint(monitor="train_loss", run_uid_is_version=True) trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=logger) trainer.fit(simple_model, dataloader) prefixes.append(callback.checkpoint_key_prefix) ln.finish() assert prefixes[0] != prefixes[1], "Two runs should produce different key prefixes" for prefix in prefixes: for af in ln.Artifact.filter(key__startswith=prefix + "/"): af.delete(permanent=True, storage=True) shutil.rmtree(tmp_path / "collision_test", ignore_errors=True) @pytest.mark.parametrize( ("use_dirpath", "logger_name", "key_source"), [ (False, "my_experiment", "logger"), (False, None, "checkpoints"), (True, "should_not_appear", "dirpath"), (True, None, "dirpath"), ], ids=[ "without-dirpath-with-logger", "without-dirpath-without-logger", "with-dirpath-with-logger", "with-dirpath-without-logger", ], ) def test_checkpoint_artifact_key_prefix_matrix( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, tmp_path: Path, use_dirpath: bool, logger_name: str | None, key_source: str, ): """Checkpoint artifact keys should match the dirpath/logger configuration matrix.""" from lightning.pytorch.loggers import CSVLogger logger: CSVLogger | bool if logger_name is None: logger = False else: logger = CSVLogger(save_dir=tmp_path, name=logger_name) callback = ll.Checkpoint( dirpath=dirpath if use_dirpath else None, monitor="train_loss", ) trainer = pl.Trainer( max_epochs=2, callbacks=[callback], logger=logger, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix if key_source == "logger": assert prefix == f"{tmp_path.name}/{logger_name}/version_0/checkpoints" elif key_source == "checkpoints": assert prefix == "checkpoints" else: assert prefix == f"{dirpath.rstrip('/')}/checkpoints" if logger_name is not None: assert logger_name not in prefix artifacts = ln.Artifact.filter(key__startswith=prefix + "/") assert len(artifacts) >= 1 for af in artifacts: assert af.kind == "model" assert af.key.startswith(prefix + "/") if not use_dirpath: for af in artifacts: af.delete(permanent=True, storage=True) if logger_name is not None: shutil.rmtree(tmp_path / logger_name, ignore_errors=True) def test_checkpoint_auto_features_without_dirpath( simple_model: pl.LightningModule, dataloader: DataLoader, tmp_path: Path, lightning_features: None, ): """Auto-features (best model, score, rank) should work without dirpath.""" from lightning.pytorch.loggers import CSVLogger logger = CSVLogger(save_dir=tmp_path, name="auto_feat") callback = ll.Checkpoint( monitor="train_loss", save_top_k=2, mode="min", ) trainer = pl.Trainer( max_epochs=3, callbacks=[callback], logger=logger, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix artifacts = ln.Artifact.filter(key__startswith=prefix + "/") assert len(artifacts) >= 1 for 
af in artifacts: values = af.features.get_values() assert "is_best_model" in values assert "score" in values assert "model_rank" in values best_count = sum( 1 for af in artifacts if af.features.get_values().get("is_best_model") is True ) assert best_count == 1 ranks = [af.features.get_values().get("model_rank") for af in artifacts] assert 0 in ranks # cleanup for af in artifacts: af.delete(permanent=True, storage=True) shutil.rmtree(tmp_path / "auto_feat", ignore_errors=True) @pytest.mark.parametrize( ("use_dirpath", "logger_name", "key_source"), [ (False, "cli_logs", "logger"), (False, None, "filename"), (True, "cli_logs", "dirpath"), (True, None, "dirpath"), ], ids=[ "without-dirpath-with-logger", "without-dirpath-without-logger", "with-dirpath-with-logger", "with-dirpath-without-logger", ], ) def test_save_config_artifact_key_matrix( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, tmp_path: Path, use_dirpath: bool, logger_name: str | None, key_source: str, ): """Config artifacts should be stored under the base prefix (dirpath > logger > empty).""" from lightning.pytorch.loggers import CSVLogger class ParserStub: def save( self, config, path, skip_none: bool, overwrite: bool, multifile: bool, ) -> None: del skip_none, overwrite, multifile Path(path).write_text(json.dumps(config, indent=2)) logger: CSVLogger | bool if logger_name is None: logger = False else: logger = CSVLogger(save_dir=tmp_path, name=logger_name) checkpoint = ll.Checkpoint( dirpath=dirpath if use_dirpath else None, monitor="train_loss", ) config = {"trainer": {"max_epochs": 1}, "model": {"hidden_size": 1}} save_config = ll.SaveConfigCallback( parser=cast(Any, ParserStub()), config=config, config_filename="config.yaml", ) trainer = pl.Trainer( max_epochs=1, callbacks=[checkpoint, save_config], logger=logger, default_root_dir=tmp_path, ) trainer.fit(simple_model, dataloader) assert trainer.log_dir is not None local_config_path = Path(trainer.log_dir) / "config.yaml" assert local_config_path.exists() assert "max_epochs" in local_config_path.read_text() if use_dirpath: assert dirpath.rstrip("/") not in str(local_config_path) if key_source == "logger": assert logger_name is not None config_key = f"{tmp_path.name}/{logger_name}/version_0/config.yaml" elif key_source == "dirpath": config_key = f"{dirpath.rstrip('/')}/config.yaml" else: config_key = "config.yaml" config_artifact = ln.Artifact.filter(key=config_key).one_or_none() assert config_artifact is not None assert config_artifact.description == "Lightning CLI config" checkpoint_artifacts = ln.Artifact.filter( key__startswith=checkpoint.checkpoint_key_prefix + "/" ) assert len(checkpoint_artifacts) >= 1 json_values = ln.models.JsonValue.filter(links_artifact__artifact=config_artifact) ln.models.ArtifactJsonValue.filter(artifact=config_artifact).delete() config_artifact.delete(permanent=True, storage=True) json_values.delete(permanent=True) for artifact in checkpoint_artifacts: artifact.delete(permanent=True, storage=True) shutil.rmtree(tmp_path / "cli_logs", ignore_errors=True) def test_save_config_artifact_tracked_as_run_input( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, tmp_path: Path, ): """Config artifacts should be tracked as run inputs while checkpoints stay outputs.""" class ParserStub: def save( self, config, path, skip_none: bool, overwrite: bool, multifile: bool, ) -> None: del skip_none, overwrite, multifile Path(path).write_text(json.dumps(config, indent=2)) ln.track() checkpoint = 
ll.Checkpoint(dirpath=dirpath, monitor="train_loss") save_config = ll.SaveConfigCallback( parser=cast(Any, ParserStub()), config={"trainer": {"max_epochs": 1}}, config_filename="config.yaml", ) trainer = pl.Trainer( max_epochs=1, callbacks=[checkpoint, save_config], logger=False, default_root_dir=tmp_path, ) trainer.fit(simple_model, dataloader) run = ln.context.run assert run is not None assert checkpoint.last_config_artifact is not None assert checkpoint.last_checkpoint_artifact is not None config_artifact = checkpoint.last_config_artifact checkpoint_artifact = checkpoint.last_checkpoint_artifact assert config_artifact.run is None assert run in config_artifact.input_of_runs.all() assert checkpoint_artifact.run == run assert checkpoint_artifact.input_of_runs.count() == 0 config_artifact.delete(permanent=True, storage=True) checkpoint_artifact.delete(permanent=True, storage=True) ln.finish() def test_checkpoint_subclass_receives_artifact_events( dataloader: DataLoader, dirpath: str, tmp_path: Path, ): """Subclass hooks should receive checkpoint, config, and hparams artifacts.""" from lightning.pytorch.loggers import CSVLogger class ModelWithHparams(pl.LightningModule): def __init__(self, hidden_size: int = 32): super().__init__() self.save_hyperparameters() self.layer = nn.Linear(10, hidden_size) self.out = nn.Linear(hidden_size, 1) def forward(self, x): return self.out(torch.relu(self.layer(x))) def training_step(self, batch, batch_idx): x, y = batch loss = nn.functional.mse_loss(self(x), y) self.log("train_loss", loss) return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters()) class ParserStub: def save( self, config, path, skip_none: bool, overwrite: bool, multifile: bool, ) -> None: del skip_none, overwrite, multifile Path(path).write_text(json.dumps(config, indent=2)) class RecordingCheckpoint(ll.Checkpoint): def __init__(self, **kwargs): super().__init__(**kwargs) self.saved_events: list[ll.ArtifactSavedEvent] = [] def on_artifact_saved(self, event: ll.ArtifactSavedEvent) -> None: self.saved_events.append(event) logger = CSVLogger(save_dir=tmp_path, name="recording_logs") checkpoint = RecordingCheckpoint(dirpath=dirpath, monitor="train_loss") save_config = ll.SaveConfigCallback( parser=cast(Any, ParserStub()), config={"trainer": {"max_epochs": 1}}, config_filename="config.yaml", ) trainer = pl.Trainer( max_epochs=1, callbacks=[checkpoint, save_config], logger=logger, default_root_dir=tmp_path, ) trainer.fit(ModelWithHparams(), dataloader) assert {event.kind for event in checkpoint.saved_events} >= { "checkpoint", "config", "hparams", } assert checkpoint.last_checkpoint_artifact is not None assert checkpoint.last_config_artifact is not None assert checkpoint.last_hparams_artifact is not None assert checkpoint.last_checkpoint_artifact.key.startswith( checkpoint.checkpoint_key_prefix + "/" ) assert checkpoint.last_config_artifact.key.endswith("/config.yaml") assert checkpoint.last_hparams_artifact.key == ( f"{checkpoint.checkpoint_key_prefix}/hparams.yaml" ) checkpoint_event = next( event for event in checkpoint.saved_events if event.kind == "checkpoint" ) assert checkpoint_event.key.startswith(checkpoint.checkpoint_key_prefix + "/") assert checkpoint_event.storage_uri == checkpoint.resolve_artifact_storage_uri( checkpoint_event.artifact ) assert checkpoint_event.storage_uri.endswith(".ckpt") artifacts_by_key = {event.key: event.artifact for event in checkpoint.saved_events} for artifact in artifacts_by_key.values(): 
ln.models.ArtifactJsonValue.filter(artifact=artifact).delete() ln.models.JsonValue.filter(links_artifact__artifact=artifact).delete( permanent=True ) artifact.delete(permanent=True, storage=True) shutil.rmtree(tmp_path / "recording_logs", ignore_errors=True) def test_checkpoint_artifact_observers_receive_shared_events( dataloader: DataLoader, dirpath: str, tmp_path: Path, ): """Observers should see the same checkpoint/config/hparams events as subclasses.""" from lightning.pytorch.loggers import CSVLogger class ModelWithHparams(pl.LightningModule): def __init__(self, hidden_size: int = 32): super().__init__() self.save_hyperparameters() self.layer = nn.Linear(10, hidden_size) self.out = nn.Linear(hidden_size, 1) def forward(self, x): return self.out(torch.relu(self.layer(x))) def training_step(self, batch, batch_idx): x, y = batch loss = nn.functional.mse_loss(self(x), y) self.log("train_loss", loss) return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters()) class ParserStub: def save( self, config, path, skip_none: bool, overwrite: bool, multifile: bool, ) -> None: del skip_none, overwrite, multifile Path(path).write_text(json.dumps(config, indent=2)) class RecordingObserver: def __init__(self): self.saved_events: list[ll.ArtifactSavedEvent] = [] def on_artifact_saved(self, event: ll.ArtifactSavedEvent) -> None: self.saved_events.append(event) def on_artifact_removed(self, event: ll.ArtifactRemovedEvent) -> None: del event observer = RecordingObserver() logger = CSVLogger(save_dir=tmp_path, name="observer_logs") checkpoint = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", artifact_observers=[observer], ) save_config = ll.SaveConfigCallback( parser=cast(Any, ParserStub()), config={"trainer": {"max_epochs": 1}}, config_filename="config.yaml", ) trainer = pl.Trainer( max_epochs=1, callbacks=[checkpoint, save_config], logger=logger, default_root_dir=tmp_path, ) trainer.fit(ModelWithHparams(), dataloader) assert {event.kind for event in observer.saved_events} >= { "checkpoint", "config", "hparams", } checkpoint_event = next( event for event in observer.saved_events if event.kind == "checkpoint" ) assert checkpoint_event.key.startswith(checkpoint.checkpoint_key_prefix + "/") assert checkpoint_event.local_path.name.endswith(".ckpt") assert checkpoint_event.storage_uri == checkpoint.resolve_artifact_storage_uri( checkpoint_event.artifact ) assert checkpoint.last_artifact_event is not None assert checkpoint.get_last_artifact("config") == checkpoint.last_config_artifact artifacts_by_key = {event.key: event.artifact for event in observer.saved_events} for artifact in artifacts_by_key.values(): ln.models.ArtifactJsonValue.filter(artifact=artifact).delete() ln.models.JsonValue.filter(links_artifact__artifact=artifact).delete( permanent=True ) artifact.delete(permanent=True, storage=True) shutil.rmtree(tmp_path / "observer_logs", ignore_errors=True) ================================================ FILE: tests/no_instance/conftest.py ================================================ import pytest @pytest.fixture def ccaplog(caplog) -> pytest.LogCaptureFixture: """Add caplog handler to our custom logger at session start.""" from lamin_utils._logger import logger logger.addHandler(caplog.handler) yield caplog logger.removeHandler(caplog.handler) ================================================ FILE: tests/no_instance/test_connect_dynamic_import.py ================================================ def test_connect_dynamic_import(ccaplog): import lamindb as ln # this only 
currently works if not instance was configured in the environment # in all other cases, we still trigger a reset_django() and hence django variables # become stale in case of a dynamic import assert ln.setup.settings.instance.slug == "none/none" ln.connect("laminlabs/lamin-site-assets") assert "connected in read-only mode" in ccaplog.text assert ln.Artifact.filter(key__startswith="blog").count() > 0 ln.setup.disconnect() ================================================ FILE: tests/no_instance/test_import_side_effects.py ================================================ import importlib.util import json import os import subprocess import sys from pathlib import Path import pytest REPO_ROOT = Path(__file__).resolve().parents[2] MODULE_NAMES = ("anndata", "h5py", "pyarrow") LIGHT_IMPORTS = {name: False for name in MODULE_NAMES} PROBE_CASES = [ ( "storage package constants stay light", "import lamindb.core.storage as storage\n_ = storage.VALID_SUFFIXES\n_ = storage.delete_storage\n_ = storage.infer_filesystem", LIGHT_IMPORTS, (), ), ( "storage object helpers stay light", "import lamindb.core.storage as storage\n_ = storage.infer_suffix\n_ = storage.write_to_disk", LIGHT_IMPORTS, (), ), ( "loaders basic helpers stay light", "import lamindb.core.loaders as loaders\n_ = loaders.load_json\n_ = loaders.load_txt\n_ = loaders.load_html", LIGHT_IMPORTS, (), ), ( "loaders tabular helpers stay light", "import lamindb.core.loaders as loaders\n_ = loaders.load_csv\n_ = loaders.load_parquet\n_ = loaders.load_tsv", LIGHT_IMPORTS, (), ), ( "loaders optional-format helpers stay light", "import lamindb.core.loaders as loaders\n_ = loaders.load_h5ad\n_ = loaders.load_h5mu\n_ = loaders.load_zarr", LIGHT_IMPORTS, (), ), ( "backed_access symbols stay light", "from lamindb.core.storage._backed_access import BackedAccessor, backed_access, _open_dataframe\n_ = BackedAccessor\n_ = backed_access\n_ = _open_dataframe", LIGHT_IMPORTS, (), ), ( "objects module import stays light", "from lamindb.core.storage.objects import infer_suffix, write_to_disk\n_ = infer_suffix\n_ = write_to_disk", LIGHT_IMPORTS, (), ), ( "backed_access pyarrow dataframe path stays anndata-free", "from upath import UPath\nimport pyarrow as pa\nimport pyarrow.parquet as pq\nfrom lamindb.core.storage._backed_access import backed_access\npath = UPath('test_import_side_effects.parquet')\npq.write_table(pa.table({'col': [1]}), path.as_posix())\ntry:\n _ = backed_access(path, engine='pyarrow')\nfinally:\n if path.exists():\n path.unlink()", {"anndata": False, "h5py": False, "pyarrow": True}, ("pyarrow",), ), ( "backed_access polars dataframe path stays light", "from upath import UPath\nfrom lamindb.core.storage._backed_access import backed_access\npath = UPath('test_import_side_effects.csv')\nwith path.open('w') as f:\n _ = f.write('col\\n1\\n')\ntry:\n _ = backed_access(path, engine='polars')\nfinally:\n if path.exists():\n path.unlink()", LIGHT_IMPORTS, ("polars",), ), ] def _probe_modules_loaded(code: str) -> dict[str, bool]: env = os.environ.copy() pythonpath = env.get("PYTHONPATH") env["PYTHONPATH"] = ( str(REPO_ROOT) if not pythonpath else os.pathsep.join([str(REPO_ROOT), pythonpath]) ) probe_lines = [ "import json", "import sys", "", f"module_names = {MODULE_NAMES!r}", "result = {name: (name in sys.modules) for name in module_names}", code, 'result.update({f"{name}_after": (name in sys.modules) for name in module_names})', "print(json.dumps(result))", ] probe = "\n".join(probe_lines) completed = subprocess.run( [sys.executable, "-c", probe], 
check=True, capture_output=True, cwd=REPO_ROOT, env=env, text=True, ) stdout_lines = [line for line in completed.stdout.splitlines() if line.strip()] return json.loads(stdout_lines[-1]) def _assert_modules( result: dict[str, bool], expected_after: dict[str, bool], label: str ): for module_name in MODULE_NAMES: assert result[module_name] is False, ( f"{label}: {module_name} loaded before probe" ) assert result[f"{module_name}_after"] is expected_after[module_name], ( f"{label}: unexpected {module_name} import state" ) @pytest.mark.parametrize( ("label", "code", "expected_after", "required_modules"), PROBE_CASES, ) def test_storage_import_side_effects( label: str, code: str, expected_after: dict[str, bool], required_modules: tuple[str, ...], ): missing_modules = [ module_name for module_name in required_modules if importlib.util.find_spec(module_name) is None ] if missing_modules: pytest.skip(f"missing optional dependency: {', '.join(missing_modules)}") result = _probe_modules_loaded(code) _assert_modules(result, expected_after, label) ================================================ FILE: tests/no_instance/test_no_default_instance.py ================================================ import lamindb as ln import pandas as pd import pytest from lamindb_setup.errors import CurrentInstanceNotConfigured def test_no_read_only_warning(ccaplog): ln.Artifact.connect("laminlabs/lamindata") ln.DB("laminlabs/lamindata") assert "connected in read-only mode" not in ccaplog.text def test_instance_not_connected(): assert ln.setup.settings.instance.slug == "none/none" with pytest.raises(CurrentInstanceNotConfigured): ln.Artifact.filter().count() def test_query_artifacts_lamindata(): artifacts = ln.Artifact.connect("laminlabs/lamindata") n_artifacts = artifacts.count() assert n_artifacts > 0 assert n_artifacts > artifacts.filter().count() def test_get_artifact_lamindata(): artifact = ln.Artifact.connect("laminlabs/lamindata").get( key="example_datasets/small_dataset1.parquet" ) assert isinstance(artifact.load(), pd.DataFrame) ================================================ FILE: tests/permissions/conftest.py ================================================ import shutil from subprocess import DEVNULL, run from time import perf_counter import lamindb_setup as ln_setup import pytest from lamin_utils import logger def pytest_sessionstart(): t_execute_start = perf_counter() # these are called in separate scripts because can't change connection # within the same python process due to django # init instance and setup RLS run( # noqa: S602 "python ./tests/permissions/scripts/setup_instance.py", shell=True, capture_output=False, ) # populate permissions and models via the admin connection run( # noqa: S602 "python ./tests/permissions/scripts/setup_access.py", shell=True, capture_output=False, ) total_time_elapsed = perf_counter() - t_execute_start print(f"time to setup the instance: {total_time_elapsed:.1f}s") def pytest_sessionfinish(session: pytest.Session): logger.set_verbosity(1) shutil.rmtree("./default_storage_permissions") ln_setup.delete("lamindb-test-permissions", force=True) run("docker stop pgtest && docker rm pgtest", shell=True, stdout=DEVNULL) # noqa: S602 ================================================ FILE: tests/permissions/jwt_utils.py ================================================ import json import psycopg2 def sign_jwt(db_url, payload: dict) -> str: with psycopg2.connect(db_url) as conn, conn.cursor() as cur: cur.execute( """ SELECT sign( %s::json, (SELECT security.get_secret('jwt_secret')), 
%s ) """, (json.dumps(payload), "HS256"), ) token = cur.fetchone()[0] if not token: msg = "Failed to generate JWT" raise ValueError(msg) return token ================================================ FILE: tests/permissions/scripts/check_lamin_dev.py ================================================ import subprocess from unittest.mock import patch import lamindb as ln import pytest from lamindb_setup.core._hub_core import select_space, select_storage def cleanup(records): for record in records: try: if isinstance(record, ln.Storage): record.artifacts.all().delete(permanent=True) record.delete(permanent=True) except Exception as e: print(f"Failed deleting {record}: {e}") assert ln.setup.settings.user.handle == "testuser1" ln.connect("laminlabs/lamin-dev") assert ln.setup.settings.instance.slug == "laminlabs/lamin-dev" # check that the rename resolves correctly (it was renamed) assert ln.Artifact.connect("laminlabs/lamin-dev1072025").db == "default" space_name = "Our test space for CI" space = ln.Space.get(name=space_name) # check that we throw an error if no storage location is managed by the space storage_loc = ln.Storage.filter(space=space).one_or_none() if storage_loc is not None: ln.Run.filter(report__storage=storage_loc).delete(permanent=True) storage_loc.artifacts.all().delete(permanent=True) storage_loc.delete(permanent=True) with pytest.raises(ln.errors.NoStorageLocationForSpace) as error: ln.track(space=space_name) # this fails to save the env artifact ln.context._transform = None ln.context._run = None # now create the storage location in the space storage_loc = ln.Storage("create-s3", space=space).save() ln.track(space=space_name) try: assert ln.context.space.name == space_name ulabel = ln.ULabel(name="My test ulabel in test space").save() # cleanup if the artifact already exists artifact = ln.Artifact(".gitignore", key="mytest") if ( artifact_cleanup := ln.Artifact.filter(hash=artifact.hash).one_or_none() ) is not None: artifact_cleanup.delete(permanent=True) # cleanup if the directory artifact already exists artifact_dir = ln.Artifact("./scripts", key="mytest-dir") if ( artifact_cleanup := ln.Artifact.filter(hash=artifact_dir.hash).one_or_none() ) is not None: artifact_cleanup.delete(permanent=True) artifact = ln.Artifact(".gitignore", key="mytest").save() artifact_dir = ln.Artifact("./scripts", key="mytest-dir").save() # check that exist ln.ULabel.get(name="My test ulabel in test space") ln.Artifact.get(key="mytest") ln.Artifact.get(key="mytest-dir") assert ulabel.space == space # ulabel should end up in the restricted space assert artifact.space == space # the below check doesn't work: another worker might have associated another storage location with the space, and then the artifact ends up in that # assert artifact.storage == storage_loc # hence this check assert artifact.storage in ln.Storage.filter(space=space) assert ln.context.transform.space == space assert ln.context.run.space == space # move the artifact to another storage location space_test_move = ln.Space.get(name="test-move") original_path = artifact.path artifact.space = space_test_move # cancel save with patch("builtins.input", return_value="x"): artifact.save() # save to the new storage location with patch("builtins.input", return_value="1"): artifact.save() assert artifact.space == space_test_move assert artifact.storage in ln.Storage.filter(space=space_test_move) assert not original_path.exists() assert artifact.path.as_posix().startswith(artifact.storage.root) assert artifact.path.exists() # move the 
directory artifact to another storage location assert artifact_dir.space == space assert artifact_dir.path.is_dir() assert artifact_dir.storage in ln.Storage.filter(space=space) original_path_dir = artifact_dir.path artifact_dir.space = space_test_move # save to the new storage location with patch("builtins.input", return_value="0"): artifact_dir.save() assert artifact_dir.space == space_test_move assert artifact_dir.storage in ln.Storage.filter(space=space_test_move) original_path_dir.fs.invalidate_cache() assert not original_path_dir.exists() assert artifact_dir.path.as_posix().startswith(artifact_dir.storage.root) assert artifact_dir.path.is_dir() # update the space of the storage location space2 = ln.Space.get(name="Our test space for CI 2") storage_loc.space = space2 storage_loc.save() response_storage = select_storage(lnid=storage_loc.uid) response_space = select_space(lnid=space2.uid) assert response_storage["space_id"] == response_space["id"] # connect to the instance before saving subprocess.run( # noqa: S602 "lamin connect laminlabs/lamin-dev", shell=True, check=True, ) result = subprocess.run( # noqa: S602 "lamin save .gitignore --key mytest --space 'Our test space for CI 2'", shell=True, capture_output=True, ) assert "key='mytest'" in result.stdout.decode() assert "storage path:" in result.stdout.decode() assert result.returncode == 0 finally: try: storage_loc.run = None storage_loc.save() except: # noqa pass cleanup( ( ulabel, artifact, artifact_dir, ln.context.transform.latest_run, ln.context.transform, storage_loc, ) ) ================================================ FILE: tests/permissions/scripts/setup_access.py ================================================ import lamindb as ln # noqa import hubmodule import hubmodule.models as hm from uuid import uuid4 from hubmodule.dev.migrate.deploy import _apply_migrations_with_tracking from hubmodule.dev.setup.install import ( _setup_extensions, _setup_secret, _setup_utils_db_modules, ) from hubmodule.sql_generators._rls import RLSGenerator from hubmodule.sql_generators._dbwrite import install_dbwrite from laminhub_instancedb.postgres import DbRoleHandler from pathlib import Path # create a db connection url that works with RLS instance_id = ln.setup.settings.instance._id def create_jwt_user(dsn_admin: str, jwt_role_name: str): db_role_handler = DbRoleHandler(dsn_admin) jwt_db_url = db_role_handler.create( jwt_role_name, expires_in=None, alter_if_exists=True ) db_role_handler.permission.grant_write_jwt(jwt_role_name) return jwt_db_url pgurl = "postgresql://postgres:pwd@0.0.0.0:5432/pgtest" # admin db connection url jwt_role_name = f"{instance_id.hex}_jwt" jwt_db_url = create_jwt_user(pgurl, jwt_role_name=jwt_role_name) _setup_extensions(pgurl) _setup_secret(pgurl) _setup_utils_db_modules(pgurl) migrations_sql_dir = Path(hubmodule.__file__).parent / "sql/0004_migrations" _apply_migrations_with_tracking(pgurl, migrations_sql_dir) rls_generator = RLSGenerator(pgurl, jwt_role_name=jwt_role_name, public_role_name=None) for i, table in enumerate(rls_generator._list_tables()): print(i, table.table_name, table.foreign_keys, table.has_space_id) rls_generator.setup() print("Created jwt db connection") install_dbwrite(pgurl) print("Installed dbwrite") # create models full_access = ln.Space(name="full access").save() # type: ignore select_access = ln.Space(name="select access").save() # type: ignore no_access = ln.Space(name="no access").save() # type: ignore # set read role for the default space usettings = ln.setup.settings.user account = 
hm.Account(id=usettings._uuid.hex, uid=usettings.uid, role="read").save() # create a test user object ln.User(uid="testuid1", handle="testuser", name="Test User").save() # no access space ulabel = ln.ULabel(name="no_access_ulabel") ulabel.space = no_access ulabel.save() # set up access to this individual record with a dummy role, # will work only after the role is changed to read, write or admin hm.AccessRecord( account=account, record_type="lamindb_ulabel", record_id=ulabel.id, role="dummy" ).save() project = ln.Project(name="No_access_project") # type: ignore project.space = no_access project.save() hm.AccessRecord( account=account, record_type="lamindb_project", record_id=project.id, role="dummy" ).save() # setup write access space hm.AccessSpace(account=account, space=full_access, role="write").save() ulabel = ln.ULabel(name="full_access_ulabel") ulabel.space = full_access ulabel.save() # setup read access space hm.AccessSpace(account=account, space=select_access, role="read").save() ulabel = ln.ULabel(name="select_ulabel") ulabel.space = select_access ulabel.save() # artificial but better to test # create a link table referencing rows in different spaces ulabel.projects.add(project) # default space, only select access by default ulabel = ln.ULabel(name="default_space_ulabel").save() ulabel.projects.add(project) project = ln.Project(name="default_space_project").save() ulabel.projects.add(project) # create a link table referencing ulabel from the default space and project from select space project = ln.Project(name="select_project") project.space = select_access project.save() ulabel.projects.add(project) # setup team and relevent models team_access = ln.Space(name="team access").save() # type: ignore team = hm.Team(id=uuid4().hex, uid="teamuiduid11", name="test_team", role="read").save() hm.AccountTeam(account=account, team=team).save() hm.AccessSpace(team=team, space=team_access, role="read").save() feature = ln.Feature(name="team_access_feature", dtype=float) feature.space = team_access feature.save() # artifact for testing tracking error and artifactblock artifact = ln.Artifact("README.md", description="test tracking error") artifact.space = select_access artifact.save() # artifact for testing tracking error and locking artifact = ln.Artifact(".gitignore", description="test locking") artifact.space = full_access artifact.is_locked = True artifact.save() # create a single record in the default space record = ln.Record(name="test-record", is_type=False).save() assert record.space_id == 1 print("Created models") # save jwt db connection ln.setup.settings.instance._db = jwt_db_url ln.setup.settings.instance._persist() ================================================ FILE: tests/permissions/scripts/setup_instance.py ================================================ import lamindb_setup as ln_setup from laminci.db import setup_local_test_postgres pgurl = setup_local_test_postgres() ln_setup.init( storage="./default_storage_permissions", name="lamindb-test-permissions", db=pgurl, ) # can't add this app in the init because don't want t trigger the initial migration # that conflicts with _install_db_module ln_setup.settings.instance._schema_str = "hubmodule" ln_setup.settings.instance._persist() ================================================ FILE: tests/permissions/test_rls_dbwritelog.py ================================================ import subprocess import time from pathlib import Path from uuid import uuid4 import hubmodule.models as hm import lamindb as ln import psycopg2 import 
pytest from django.db import connection, transaction from django.db.utils import IntegrityError, InternalError, ProgrammingError from hubmodule.sql_generators._dbwrite import uninstall_dbwrite from jwt_utils import sign_jwt from lamindb.models.artifact import track_run_input from lamindb_setup.core.django import DBToken, db_token_manager from psycopg2.extensions import adapt pgurl = "postgresql://postgres:pwd@0.0.0.0:5432/pgtest" # admin db connection url user_uuid = ln.setup.settings.user._uuid.hex expiration = time.time() + 2000 # full collaborator token token = sign_jwt( pgurl, {"account_id": user_uuid, "exp": expiration, "type": "collaborator"} ) # read-only token token_read = sign_jwt( pgurl, {"account_id": user_uuid, "exp": expiration, "type": "read-only"} ) # init an instance of DBToken manually db_token = DBToken({}) db_token._token = token db_token._token_query = f"SELECT set_token({adapt(token).getquoted().decode()}, true);" db_token._expiration = expiration db_token_manager.set(db_token) def test_token_expiration(): # init connection.connection with connection.cursor() as cur: pass expired_token = sign_jwt( pgurl, {"account_id": user_uuid, "exp": time.time() - 1000, "type": "collaborator"}, ) # check that an expired token is invalid with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT set_token(%s);", (expired_token,)) def test_authentication(): # just check that the token was setup with connection.cursor() as cur: cur.execute( "SELECT 1 in (SELECT id FROM public.check_access() WHERE role = 'read');" ) result = cur.fetchall()[0][0] assert result # check querying without setting jwt with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT * FROM lamindb_ulabel;") # test that auth can't be hijacked # false table created before with ( pytest.raises(psycopg2.errors.DuplicateTable), connection.connection.cursor() as cur, ): cur.execute( """ CREATE TEMP TABLE access( id int, role varchar(20), type text ) ON COMMIT DROP; SELECT set_token(%s); """, (token,), ) # check that jwt user can't set arbitrary account_id manually with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute( """ CREATE TEMP TABLE access( id int, role varchar(20), type text ) ON COMMIT DROP; INSERT INTO access (id, role, type) VALUES (1, 'admin', 'space'); SELECT * FROM check_access(); """ ) # check manual insert with ( pytest.raises(psycopg2.errors.InsufficientPrivilege), connection.connection.cursor() as cur, ): cur.execute( """ SELECT set_token(%s); INSERT INTO access (id, role, type) VALUES (1, 'admin', 'space'); """, (token,), ) # test access to the security schema with ( pytest.raises(psycopg2.errors.InsufficientPrivilege), connection.connection.cursor() as cur, ): cur.execute("SELECT security.get_secret('jwt_secret');") # test read-only token with connection.connection.cursor() as cur: cur.execute("SELECT set_token(%s); SELECT * FROM check_access()", (token_read,)) result = cur.fetchall() assert len(result) == 1 assert result[0] == (1, "read", "space") assert ln.base.users._user_has_write_access() def test_select_without_db_token(): # with db token can be read in the default space with connection.cursor() as cur: cur.execute("SELECT * FROM lamindb_record;") results = cur.fetchall() assert len(results) == 1 # the same assert ln.Record.filter().count() == 1 # errors if can't select ln.Record.get(1) # no db token, everything in the default 
space with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT * FROM lamindb_record;") with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT * FROM lamindb_record WHERE id = 1;") # no db token, in different spaces with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT * FROM lamindb_artifact;") with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT * FROM lamindb_ulabel;") # no db token, utility tables with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT * FROM lamindb_user;") with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT * FROM lamindb_space;") def test_fine_grained_permissions_account_and_dbwrite(): # check select assert ln.ULabel.filter().count() == 3 assert ln.Project.filter().count() == 2 ulabel = ln.ULabel.get(name="default_space_ulabel") assert ulabel.projects.all().count() == 2 # check delete # should delete ulabel_del = ln.ULabel.get(name="full_access_ulabel") ulabel_del_id = ulabel_del.id ulabel_del.delete(permanent=True) assert ln.ULabel.filter().count() == 2 # check the logs for delete log_rec = ( hm.DbWrite.filter(sqlrecord_id=ulabel_del_id, table_name="lamindb_ulabel") .order_by("-id") .first() ) assert log_rec.event_type == "DELETE" assert log_rec.data is not None assert log_rec.created_by_id == 1 # check the logs for insert log_rec = ( hm.DbWrite.filter(sqlrecord_id=ulabel_del_id, table_name="lamindb_ulabel") .order_by("id") .first() ) assert log_rec.event_type == "INSERT" assert log_rec.data is None assert log_rec.created_by_id is None # this was inserted without setting a db token # should not delete, does not error for some reason ln.ULabel.get(name="select_ulabel").delete(permanent=True) assert ln.ULabel.filter().count() == 2 # default space ulabel.delete(permanent=True) assert ln.ULabel.filter().count() == 2 # check insert # should succeed space = ln.Space.get(name="full access") ulabel = ln.ULabel(name="new label") ulabel.space = space ulabel.save() # should fail with pytest.raises(ln.errors.NoWriteAccess): ln.ULabel(name="new label fail").save() for space_name in ["select access", "no access"]: space = ln.Space.get(name=space_name) ulabel = ln.ULabel(name="new label fail") ulabel.space = space with pytest.raises(ln.errors.NoWriteAccess): ulabel.save() # check update # should succeed ulabel = ln.ULabel.get(name="new label") ulabel.name = "new label update" ulabel.save() ulabel = ln.ULabel.get(name="new label update") # check that it is saved # check the logs for update log_rec = ( hm.DbWrite.filter(sqlrecord_id=ulabel.id, table_name="lamindb_ulabel") .order_by("-id") .first() ) assert log_rec.event_type == "UPDATE" assert log_rec.data["name"] == "new label" # changed assert "id" not in log_rec.data # didn't change assert log_rec.created_by_id == 1 # should fail ulabel = ln.ULabel.get(name="select_ulabel") ulabel.name = "select_ulabel update" with pytest.raises(ln.errors.NoWriteAccess): ulabel.save() # default space ulabel = ln.ULabel.get(name="default_space_ulabel") ulabel.name = "default_space_ulabel update" with pytest.raises(ln.errors.NoWriteAccess): ulabel.save() # check link tables # check insert project = ln.Project(name="Myproject") project.space = ln.Space.get(name="full access") 
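# --- illustrative sketch (added for documentation, not part of the original test suite) ---
# Writes are routed through a record's `.space`: saving into a space the current JWT
# account cannot write to raises ln.errors.NoWriteAccess, as the assertions in this test
# exercise. A minimal helper capturing that pattern, assuming the "no access" space
# created in scripts/setup_access.py; it is defined only as a sketch and never called.
def _sketch_expect_no_write_access(space_name: str = "no access") -> None:
    """Try to save a throw-away ULabel into `space_name` and expect NoWriteAccess."""
    sketch_label = ln.ULabel(name="sketch label")  # hypothetical record name
    sketch_label.space = ln.Space.get(name=space_name)
    with pytest.raises(ln.errors.NoWriteAccess):
        sketch_label.save()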
project.save() ulabel = ln.ULabel.get(name="new label update") ulabel.projects.add(project) assert ulabel.projects.all().count() == 1 # check select of a link table referencing unavailable rows assert ln.ULabel.get(name="select_ulabel").projects.all().count() == 0 # test SpaceBlock space = ln.Space.get(name="select access") with pytest.raises(ln.errors.NoWriteAccess): ln.models.SpaceBlock(space=space, content="test", kind="readme").save() # test ArtifactBlock, artifact is read-only artifact = ln.Artifact.get(description="test tracking error") with pytest.raises(ProgrammingError): ln.models.ArtifactBlock(artifact=artifact, content="test", kind="readme").save() # test BranchBlock, the account is read-only branch = ln.Branch.get(1) # main branch in all space with pytest.raises(ProgrammingError): ln.models.BranchBlock(branch=branch, content="test", kind="readme").save() def test_fine_grained_permissions_team(): assert ln.Feature.filter().count() == 1 ln.Feature.get(name="team_access_feature") def test_fine_grained_permissions_single_records(): assert not ln.ULabel.filter(name="no_access_ulabel").exists() assert not ln.Project.filter(name="No_access_project").exists() # check that the logs are not available for the ulabel with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute("SELECT id FROM lamindb_ulabel WHERE name = 'no_access_ulabel'") ulabel_id = cur.fetchone()[0] assert not hm.DbWrite.filter( sqlrecord_id=ulabel_id, table_name="lamindb_ulabel" ).exists() # switch access to this ulabel to read with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( """ UPDATE hubmodule_accessrecord SET role = 'read' WHERE account_id = %s AND record_type = 'lamindb_ulabel' """, (user_uuid,), ) ulabel = ln.ULabel.get(name="no_access_ulabel") # check that the logs are available now assert hm.DbWrite.filter( sqlrecord_id=ulabel.id, table_name="lamindb_ulabel" ).exists() new_name = "new_name_single_rls_access_ulabel" ulabel.name = new_name with pytest.raises(ln.errors.NoWriteAccess): ulabel.save() # switch access for the project to read with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( """ UPDATE hubmodule_accessrecord SET role = 'read' WHERE account_id = %s AND record_type = 'lamindb_project' """, (user_uuid,), ) # now the project is readable project = ln.Project.get(name="No_access_project") # can't insert into lamindb_ulabelproject because the ulabel is read-only with pytest.raises(ProgrammingError): ulabel.projects.add(project) # switch access for the ulabel to write with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( """ UPDATE hubmodule_accessrecord SET role = 'write' WHERE account_id = %s AND record_type = 'lamindb_ulabel' """, (user_uuid,), ) ulabel.save() # can insert into lamindb_ulabelproject because the ulabel is now write-able # and the project is read-only, but this doesn't matter as the principal key is ulabel ulabel.projects.add(project) assert ulabel.projects.count() == 1 ulabel.delete(permanent=True) assert not ln.ULabel.filter(name="no_access_ulabel").exists() # tests that token is set properly in atomic blocks def test_atomic(): with transaction.atomic(): assert ln.Feature.filter().count() == 1 # test with nested with transaction.atomic(): assert ln.Feature.filter().count() == 1 feature = ln.Feature(name="atomic_feature", dtype=float) feature.space = ln.Space.get(name="full access") feature.save() assert ln.Feature.filter().count() == 2 def test_utility_tables(): # can select in these tables assert 
ln.Space.filter().count() == 5 # can't select assert hm.Account.filter().count() == 0 assert hm.Team.filter().count() == 0 assert hm.AccountTeam.filter().count() == 0 assert hm.AccessSpace.filter().count() == 0 assert hm.AccessRecord.filter().count() == 0 # can't update a space space = ln.Space.get(id=1) # default space space.name = "new name" with pytest.raises(ProgrammingError): space.save() with pytest.raises(ProgrammingError): ln.Space(name="new space").save() # can't insert with pytest.raises(ProgrammingError): hm.Account(id=uuid4().hex, uid="accntid2", role="admin").save() def test_user_rls(): assert ln.User.filter().count() == 2 # should fail because can modify only the current user user = ln.User.get(handle="testuser") user.name = "New Name" with pytest.raises(ProgrammingError): user.save() # can't insert a user with a different uid with pytest.raises(ProgrammingError): ln.User(handle="insert_new_user", uid="someuidd").save() # also triggers RLS with pytest.raises(ProgrammingError): ln.User(handle="insert_new_user", uid=user.uid).save() # try to insert a user with the same uid # should not trigger RLS because the uid is the same, it should throw an IntegrityError with pytest.raises(IntegrityError): ln.User(handle="insert_new_user", uid=ln.setup.settings.user.uid).save() # can modify the current user user = ln.User.get(1) user.name = "New Name" user.save() def test_write_role(): # switch user role to write with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( "UPDATE hubmodule_account SET role = 'write' WHERE id = %s", (user_uuid,) ) ln.ULabel(name="new label account default space").save() # switch user role back to read and team role to write with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( "UPDATE hubmodule_account SET role = 'read' WHERE id = %s", (user_uuid,) ) cur.execute( "UPDATE hubmodule_team SET role = 'write' WHERE uid = 'teamuiduid11'", ) ln.ULabel(name="new label team default space").save() with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( "UPDATE hubmodule_team SET role = 'read' WHERE uid = 'teamuiduid11'", ) def test_locking(): artifact = ln.Artifact.get(description="test locking") artifact.description = "new description" with pytest.raises(ln.errors.NoWriteAccess) as e: artifact.save() assert "It is not allowed to modify or create locked" in str(e) def test_tracking_error(): # switch user role to write to create the transform and run with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( "UPDATE hubmodule_account SET role = 'write' WHERE id = %s", (user_uuid,) ) artifact = ln.Artifact.get(description="test tracking error") transform = ln.Transform(key="My transform").save() run = ln.Run(transform).save() # this error because ln.setup.settings.instance._db_permissions is not jwt # it is None with pytest.raises(ln.errors.NoWriteAccess) as e: track_run_input(artifact, run) assert "You’re not allowed to write to the instance " in str(e) # the instance is local so we set this manually ln.setup.settings.instance._db_permissions = "jwt" # artifact.space is not available for writes with pytest.raises(ln.errors.NoWriteAccess) as e: track_run_input(artifact, run) assert "You’re not allowed to write to the space " in str(e) # this artifact is locked artifact = ln.Artifact.get(description="test locking") with pytest.raises(ln.errors.NoWriteAccess) as e: track_run_input(artifact, run) assert "It is not allowed to modify locked records" in str(e) # switch user role back to read with 
psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( "UPDATE hubmodule_account SET role = 'read' WHERE id = %s", (user_uuid,) ) # as the user is read-only now, 2 spaces are unavailable for writes (artifact.space, run.space) artifact = ln.Artifact.get(description="test tracking error") with pytest.raises(ln.errors.NoWriteAccess) as e: track_run_input(artifact, run) assert "You’re not allowed to write to the spaces " in str(e) ln.setup.settings.instance._db_permissions = None def test_token_reset(): db_token_manager.reset() # account_id is not set with pytest.raises(InternalError) as error: ln.ULabel.filter().count() assert "JWT is not set" in error.exconly() with pytest.raises(InternalError) as error, transaction.atomic(): ln.ULabel.filter().count() assert "JWT is not set" in error.exconly() def test_dbwrite_uninstall(): triggers_exist_query = ( "SELECT EXISTS (SELECT 1 FROM pg_trigger WHERE tgname LIKE 'dbwrite_%')" ) table_exists_query = "SELECT to_regclass('public.hubmodule_dbwrite') IS NOT NULL" with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute(triggers_exist_query) triggers_exist = cur.fetchone()[0] assert triggers_exist uninstall_dbwrite(pgurl, drop_table=False) with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute(triggers_exist_query) triggers_exist = cur.fetchone()[0] assert not triggers_exist cur.execute(table_exists_query) table_exists = cur.fetchone()[0] assert table_exists uninstall_dbwrite(pgurl, drop_table=True) with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute(table_exists_query) table_exists = cur.fetchone()[0] assert not table_exists def test_lamin_dev(): script_path = Path(__file__).parent.resolve() / "scripts/check_lamin_dev.py" subprocess.run( # noqa: S602 f"python {script_path}", shell=True, check=True, ) ================================================ FILE: tests/profiling/import_lamindb.py ================================================ import lamindb as ln # noqa: F401 ================================================ FILE: tests/profiling/import_lamindb_and_connect.py ================================================ import lamindb as ln # should connect to another instance than laminlabs/lamindata # because the former is used to log the test run ln.connect("laminlabs/lamin-site-assets") ================================================ FILE: tests/profiling/import_lamindb_core_storage.py ================================================ import lamindb.core.storage # noqa: F401 ================================================ FILE: tests/profiling/import_records_from_dataframe.py ================================================ import argparse from datetime import datetime from random import Random from time import perf_counter import lamindb as ln import pandas as pd def generate_values(dtype: str, n_rows: int, rng: Random): cell_types = [ "T cell", "B cell", "natural killer cell", "monocyte", "epithelial cell", ] if dtype in {"float", "num"}: return [round(rng.uniform(0.0, 100.0), 3) for _ in range(n_rows)] if dtype.startswith("cat["): return [rng.choice(cell_types) for _ in range(n_rows)] raise ValueError(f"Unsupported dtype: {dtype}") @ln.flow("JuJZZEsit1KV") def main(n_rows: int): feature_names = [ "age_or_mean_of_age_range", "array_col", "cell_type_by_model", ] rng = Random(0) features = ln.Feature.filter(name__in=feature_names) dtypes_by_feature = {feature.name: feature.dtype_as_str for feature in features} data: dict[str, list] = {} print("Generating random dataframe values...") for 
feature in features: data[feature.name] = generate_values( dtypes_by_feature[feature.name], n_rows, rng ) df = pd.DataFrame(data) print(df.head(5)) print("Running Record.from_dataframe()...") from_dataframe_start = perf_counter() records = ln.Record.from_dataframe( df, type=f"test-import-records-from-dataframe-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}", ) from_dataframe_duration_sec = perf_counter() - from_dataframe_start print(f"... completed in {from_dataframe_duration_sec:.6f}s") print("Saving records...") save_start = perf_counter() records.save() save_duration_sec = perf_counter() - save_start print(f"... completed in {save_duration_sec:.6f}s") run = ln.context.run params = run.params or {} params.update( { "from_dataframe_duration_sec": round(from_dataframe_duration_sec, 6), "save_duration_sec": round(save_duration_sec, 6), } ) run.params = params run.save() if __name__ == "__main__": parser = argparse.ArgumentParser( description="Prepare and optionally save test Records rows via Record.from_dataframe()." ) parser.add_argument("--rows", type=int, default=100) args = parser.parse_args() ln.connect("laminlabs/lamindata") main(n_rows=args.rows) ================================================ FILE: tests/storage/conftest.py ================================================ import shutil from pathlib import Path from subprocess import DEVNULL, run from time import perf_counter import lamindb as ln import lamindb_setup as ln_setup import pytest from lamin_utils import logger from laminci.db import setup_local_test_postgres def create_test_instance(pgurl: str): ln_setup.init( storage="./default_storage_unit_storage", modules="bionty", name="lamindb-unit-tests-storage", db=pgurl, ) ln_setup.register() # temporarily ln.settings.creation.artifact_silence_missing_run_warning = True ln.settings.track_run_inputs = False ln.Storage("s3://lamindb-ci/test-data").save() ln.Storage("s3://lamindb-test/core").save() ln.Storage("s3://lamindb-test/storage").save() def pytest_sessionstart(): t_execute_start = perf_counter() ln_setup._TESTING = True try: pgurl = setup_local_test_postgres() except RuntimeError: run("docker stop pgtest && docker rm pgtest", shell=True, stdout=DEVNULL) # noqa: S602 pgurl = setup_local_test_postgres() try: create_test_instance(pgurl) except Exception as e: print("failed to create test instance:", e) print("deleting the instance") delete_test_instance() # below currently fails because cannot create two instances in the same session # create_test_instance(pgurl) print("now rerun") quit() total_time_elapsed = perf_counter() - t_execute_start print(f"time to setup the instance: {total_time_elapsed:.1f}s") assert ln.Storage.filter(root="s3://lamindb-ci/test-data").one_or_none() is not None def delete_test_instance(): logger.set_verbosity(1) if Path("./default_storage_unit_storage").exists(): shutil.rmtree("./default_storage_unit_storage") # handle below better in the future for path in ( "s3://lamindb-test/storage/.lamindb", "s3://lamindb-test/core/.lamindb", "s3://lamindb-ci/lamindb-unit-tests-cloud/.lamindb", "s3://lamindb-ci/test-settings-switch-storage/.lamindb", ): upath = ln_setup.core.upath.UPath(path) if upath.exists(): upath.rmdir() ln_setup.delete("lamindb-unit-tests-storage", force=True) def pytest_sessionfinish(session: pytest.Session): delete_test_instance() run("docker stop pgtest && docker rm pgtest", shell=True, stdout=DEVNULL) # noqa: S602 @pytest.fixture def ccaplog(caplog): """Add caplog handler to our custom logger at session start.""" from 
lamin_utils._logger import logger # Add caplog's handler to our custom logger logger.addHandler(caplog.handler) yield caplog # Clean up at the end of the session logger.removeHandler(caplog.handler) ================================================ FILE: tests/storage/test_artifact_storage.py ================================================ import shutil import anndata as ad import lamindb as ln import pytest from lamindb.errors import ( IntegrityError, ) def test_create_from_anndata_in_existing_cloud_storage(): filepath = "s3://lamindb-test/core/scrnaseq_pbmc68k_tiny.h5ad" artifact = ln.Artifact.from_anndata( filepath, description="test_create_from_anndata_cloudpath" ) assert artifact.n_observations == 70 artifact.save() assert ln.Artifact.get(path=artifact.path) == artifact # check that the local filepath has been cleared assert not hasattr(artifact, "_local_filepath") assert artifact.path.as_posix().startswith("s3://lamindb-test/core") @pytest.mark.parametrize( "filepath_str", ["s3://lamindb-ci/test-data/test.parquet", "s3://lamindb-ci/test-data/test.csv"], ) @pytest.mark.parametrize("skip_check_exists", [False, True]) @pytest.mark.parametrize("skip_size_and_hash", [False, True]) def test_create_small_file_from_remote_path( filepath_str, skip_check_exists, skip_size_and_hash ): ln.settings.creation.artifact_skip_size_hash = skip_size_and_hash artifact = ln.Artifact( filepath_str, skip_check_exists=skip_check_exists, ) artifact.save() # test cache() file_from_local = ln.Artifact(artifact.cache(), description="test") # test hash equivalency when computed on local machine if not skip_size_and_hash: assert file_from_local.hash == artifact.hash assert file_from_local._hash_type == "md5" assert artifact._hash_type == "md5" assert artifact.path.as_posix() == filepath_str assert artifact.load().iloc[0].tolist() == [ 0, "Abingdon island giant tortoise", "Chelonoidis abingdonii", 106734, "ASM359739v1", "GCA_003597395.1", "Full genebuild", "-", "-", ] artifact.delete(permanent=True, storage=False) ln.settings.creation.artifact_skip_size_hash = False def test_versioning_arifact_from_existing_path(ccaplog): artifact1 = ln.Artifact("s3://lamindb-ci/test-data/test.parquet").save() artifact2 = ln.Artifact( "s3://lamindb-ci/test-data/test.csv", revises=artifact1 ).save() assert "you are saving to a non-latest version of the artifact" not in ccaplog.text assert artifact1.stem_uid == artifact2.stem_uid assert artifact1.uid != artifact2.uid artifact1.delete(permanent=True, storage=False) artifact2.delete(permanent=True, storage=False) def test_create_big_file_from_remote_path(): # the point of this test is check the multi-upload hash filepath_str = "s3://lamindb-test/core/human_immune.h5ad" # we don't use from_anndata() here because we test this with a small file for shorter run time artifact = ln.Artifact(filepath_str) assert not artifact._key_is_virtual assert artifact._real_key is None assert artifact.key == "human_immune.h5ad" assert artifact._hash_type == "md5-3" assert artifact.size == 21960324 assert artifact.path.as_posix() == filepath_str # check _real_key artifact = ln.Artifact(filepath_str, key="adata_test_key.h5ad") assert artifact._key_is_virtual assert artifact.key == "adata_test_key.h5ad" assert artifact._real_key.endswith("human_immune.h5ad") assert artifact.path.as_posix() == filepath_str def test_delete_artifact_from_non_managed_storage(): artifact = ln.Artifact( "s3://lamindb-dev-datasets/file-to-test-for-delete.csv", description="My test file to delete from non-default storage", ).save() 
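# --- illustrative sketch (added for documentation, not part of the original test) ---
# For artifacts registered from a storage location that the current instance does not
# manage, `artifact.delete()` alone raises IntegrityError because lamindb refuses to
# delete files in storage it does not own; the registry record can still be removed with
# `delete(storage=False, permanent=True)`, which is what the assertions below exercise.
# A minimal helper mirroring that pattern; defined only as a sketch and never called.
def _sketch_delete_record_only(some_artifact: ln.Artifact) -> None:
    """Remove the registry record while leaving the file in non-managed storage intact."""
    with pytest.raises(IntegrityError):
        some_artifact.delete()  # refuses to touch storage it does not manage
    some_artifact.delete(storage=False, permanent=True)  # record-only deletion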
assert artifact.storage.instance_uid != ln.setup.settings.instance.uid assert artifact.key is not None filepath = artifact.path with pytest.raises(IntegrityError) as e: artifact.delete() assert e.exconly().startswith( "lamindb.errors.IntegrityError: Cannot simply delete artifacts" ) artifact.delete(storage=False, permanent=True) assert ( ln.Artifact.filter( description="My test file to delete from non-default storage", branch_id=None, ).first() is None ) assert filepath.exists() def test_huggingface_paths(): artifact_adata = ln.Artifact( "hf://datasets/Koncopd/lamindb-test@main/anndata/pbmc68k_test.h5ad", description="hf adata", ) artifact_adata.save() assert artifact_adata.key == "anndata/pbmc68k_test.h5ad" assert artifact_adata.hash is not None assert isinstance(artifact_adata.load(), ad.AnnData) assert artifact_adata._cache_path.exists() artifact_adata._cache_path.unlink() artifact_pq = ln.Artifact( "hf://datasets/Koncopd/lamindb-test/sharded_parquet", description="hf parquet" ) artifact_pq.save() assert artifact_pq.hash is not None assert len(artifact_pq.open().files) == 11 assert artifact_pq.cache().is_dir() shutil.rmtree(artifact_pq._cache_path) artifact_adata.delete(permanent=True, storage=False) artifact_pq.delete(permanent=True, storage=False) def test_gcp_paths(): artifact_folder = ln.Artifact( "gs://rxrx1-europe-west4/images/test/HEPG2-08", description="Test GCP folder" ).save() assert artifact_folder.hash == "6r5Hkce0UTy7X6gLeaqzBA" assert artifact_folder.n_files == 14772 artifact_file = ln.Artifact( "gs://rxrx1-europe-west4/images/test/HEPG2-08/Plate1/B02_s1_w1.png", description="Test GCP file", ).save() assert artifact_file.hash == "foEgLjmuUHO62CazxN97rA" cache_path = artifact_file.cache() assert cache_path.is_file() cache_path.unlink() artifact_folder.delete(permanent=True, storage=False) artifact_file.delete(permanent=True, storage=False) def test_http_paths(): http_path = ln.UPath( "https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/README.md" ) artifact_readme = ln.Artifact(http_path, description="register http readme").save() # might change assert artifact_readme.hash is not None cache_path = artifact_readme.cache() assert cache_path.exists() assert cache_path.stat().st_size == http_path.stat().st_size cache_path.unlink() # just check saving for the second time (when Strage record is in the db) artifact_license = ln.Artifact( "https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/LICENSE", description="register http license", ).save() assert artifact_license.hash == "IQxRSNjvb7w2OLFeWqYlsg" artifact_readme.delete(permanent=True, storage=False) artifact_license.delete(permanent=True, storage=False) # also see test in lamindb-setup/tests/storage/test_storage_stats.py # there is also a test for GCP there def test_folder_like_artifact_s3(): study0_data = ln.Artifact("s3://lamindata/iris_studies/study0_raw_images") assert study0_data.hash == "IVKGMfNwi8zKvnpaD_gG7w" assert study0_data._hash_type == "md5-d" assert study0_data.n_files == 51 assert study0_data.size == 658465 def test_single_file_directory_preserved(tmp_path): local_dir = tmp_path / "single_file_dir" local_dir.mkdir() (local_dir / "only.txt").write_text("single file") storage = ln.Storage.get(root="s3://lamindb-test/storage") artifact = ln.Artifact( local_dir, key="tests/single-file-directory", storage=storage ).save() assert artifact.path.as_posix().startswith("s3://lamindb-test/storage") assert artifact.n_files == 1 assert artifact.path.is_dir() assert [file.name for file in 
artifact.path.iterdir()] == ["only.txt"] artifact.delete(permanent=True) ================================================ FILE: tests/storage/test_artifact_zarr.py ================================================ import shutil from pathlib import Path import anndata as ad import lamindb as ln import numpy as np import pandas as pd import pytest from lamindb.core.storage._zarr import identify_zarr_type from lamindb_setup.core.upath import ( CloudPath, ) @pytest.fixture(scope="session") def get_small_adata(): return ad.AnnData( X=np.array([[1, 2, 3], [4, 5, 6]]), obs={"feat1": ["A", "B"]}, var=pd.DataFrame(index=["MYC", "TCF7", "GATA1"]), obsm={"X_pca": np.array([[1, 2], [3, 4]])}, ) def test_zarr_upload_cache(get_small_adata): previous_storage = ln.setup.settings.storage.root_as_str ln.settings.storage = "s3://lamindb-test/core" zarr_path = Path("./test_adata.zarr") get_small_adata.write_zarr(zarr_path) artifact = ln.Artifact(zarr_path, key="test_adata.zarr") assert not artifact._storage_ongoing assert artifact.otype == "AnnData" assert artifact.n_files >= 1 artifact.save() assert ln.Artifact.get(path=artifact.path) == artifact assert not artifact._storage_ongoing assert isinstance(artifact.path, CloudPath) assert artifact.path.exists() assert identify_zarr_type(artifact.path) == "anndata" shutil.rmtree(artifact.cache()) cache_path = artifact._cache_path assert isinstance(artifact.load(), ad.AnnData) assert cache_path.is_dir() shutil.rmtree(cache_path) assert not cache_path.exists() artifact.cache() assert cache_path.is_dir() artifact.delete(permanent=True, storage=True) shutil.rmtree(zarr_path) # test zarr from memory artifact = ln.Artifact(get_small_adata, key="test_adata.anndata.zarr") assert not artifact._storage_ongoing assert artifact._local_filepath.is_dir() assert artifact.otype == "AnnData" assert artifact.suffix == ".anndata.zarr" assert artifact.n_files >= 1 ln.save([artifact]) # use bulk save here for testing assert not artifact._storage_ongoing assert isinstance(artifact.path, CloudPath) assert artifact.path.exists() cache_path = artifact._cache_path assert cache_path.is_dir() shutil.rmtree(cache_path) assert not cache_path.exists() artifact._memory_rep = None assert isinstance(artifact.load(), ad.AnnData) assert cache_path.is_dir() artifact.delete(permanent=True, storage=True) ln.settings.storage = previous_storage ================================================ FILE: tests/storage/test_cache.py ================================================ import shutil from pathlib import Path from time import sleep import lamindb as ln import pytest from lamindb.core.loaders import load_h5ad from lamindb_setup._set_managed_storage import set_managed_storage # https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test # switch to cloud storage and back @pytest.fixture def switch_storage(): cloud_storage = "s3://lamindb-ci/lamindb-unit-tests-cloud" set_managed_storage(cloud_storage) yield cloud_storage set_managed_storage("./default_storage_unit_storage") def test_local_cache(): # check that we have local storage local_storage = Path("./default_storage_unit_storage").resolve().as_posix() assert ln.setup.settings.storage.root_as_str == local_storage test_file = ln.examples.datasets.anndata_file_pbmc68k_test() adata = load_h5ad(test_file) artifact = ln.Artifact.from_anndata(adata, key="test_cache.h5ad") temp_path = artifact._local_filepath.resolve() assert temp_path.exists() assert ln.setup.settings.cache_dir in temp_path.parents artifact.save() assert 
artifact.path.exists() assert not temp_path.exists() artifact.delete(permanent=True) # check directories adata_zarr_pth = Path("test_adata.zarr") adata.write_zarr(adata_zarr_pth) assert adata_zarr_pth.exists() artifact = ln.Artifact(adata_zarr_pth, key="test_cache.zarr").save() assert adata_zarr_pth.exists() assert artifact.path.exists() assert artifact.path.name != artifact.key shutil.rmtree(adata_zarr_pth) artifact.delete(permanent=True) # check directories in cache cache_dir = ln.setup.settings.cache_dir adata_zarr_pth = cache_dir / "test_adata.zarr" adata.write_zarr(adata_zarr_pth) artifact = ln.Artifact(adata_zarr_pth, key="test_cache.zarr") assert adata_zarr_pth.exists() artifact.save() assert not adata_zarr_pth.exists() assert artifact.path.exists() assert artifact.path.name != artifact.key artifact.delete(permanent=True) def test_cloud_cache(switch_storage): # check that we have cloud storage assert ln.setup.settings.storage.root_as_str == switch_storage cache_dir = ln.setup.settings.cache_dir assert cache_dir is not None test_file = ln.examples.datasets.anndata_file_pbmc68k_test() # test cache for saving an in-memory object adata = load_h5ad(test_file) artifact = ln.Artifact.from_anndata(adata, key="test_cache.h5ad") temp_path = artifact._local_filepath.resolve() assert cache_dir in temp_path.parents artifact.save() assert not temp_path.exists() cloud_path = artifact.path cache_path = artifact._cache_path assert cache_path.exists() assert ( cache_path == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad" ) assert cloud_path.modified.timestamp() < cache_path.stat().st_mtime artifact.delete(permanent=True) # test cache for saving an on-disk object artifact = ln.Artifact.from_anndata(test_file, key="test_cache.h5ad") artifact.save() cloud_path = artifact.path cache_path = artifact._cache_path assert cache_path.exists() assert ( cache_path == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad" ) assert test_file.stat().st_mtime < cache_path.stat().st_mtime assert cloud_path.modified.timestamp() < cache_path.stat().st_mtime artifact.delete(permanent=True) # test cache for a directory on-disk object outside the cache dir adata_zarr_pth = Path("test_adata.zarr") adata.write_zarr(adata_zarr_pth) artifact = ln.Artifact(adata_zarr_pth, key="test_cache.zarr") artifact.save() assert adata_zarr_pth.is_dir() cache_path = artifact._cache_path assert cache_path.is_dir() assert ( cache_path == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.zarr" ) shutil.rmtree(adata_zarr_pth) artifact.delete(permanent=True) # inside the cache dir adata_zarr_pth = cache_dir / "test_adata.zarr" adata.write_zarr(adata_zarr_pth) artifact = ln.Artifact(adata_zarr_pth, key="test_cache.zarr") assert adata_zarr_pth.exists() artifact.save() assert not adata_zarr_pth.exists() cache_path = artifact._cache_path assert cache_path.is_dir() assert ( cache_path == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.zarr" ) artifact.delete(permanent=True) def test_cloud_cache_versions(switch_storage): adata = load_h5ad(ln.examples.datasets.anndata_file_pbmc68k_test()) cache_dir = ln.setup.settings.cache_dir assert cache_dir is not None artifact = ln.Artifact.from_anndata(adata, key="test_cache.h5ad") assert ln.settings.cache_dir in artifact._local_filepath.parents artifact.save() cache_path_v1 = artifact.cache() assert cache_path_v1.exists() assert ( cache_path_v1 == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad" ) cache_path_v1.unlink() 
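# --- illustrative sketch (added for documentation, not part of the original test) ---
# `artifact.cache()` restores a deleted local copy: after unlinking the cached file,
# calling `cache()` again re-downloads the artifact into ln.setup.settings.cache_dir,
# which is what the surrounding assertions verify. A minimal helper capturing that
# round trip; defined only as a sketch and never called.
def _sketch_recache(some_artifact: ln.Artifact):
    """Drop the local cache copy of a saved artifact and restore it via cache()."""
    local_copy = some_artifact.cache()
    local_copy.unlink()  # remove the local copy only, the cloud object stays
    assert not local_copy.exists()
    restored = some_artifact.cache()  # re-download into the cache directory
    assert restored.exists()
    return restored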
artifact.cache(print_progress=False) assert cache_path_v1.exists() assert ( cache_path_v1 == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad" ) timestamp_v1 = cache_path_v1.stat().st_mtime # hope it is enough to avoid random timestamp problems further sleep(1) # new version adata.obs["test_cache"] = "test" artifact_v2 = ln.Artifact.from_anndata( adata, key="test_cache.h5ad", revises=artifact ) assert ln.settings.cache_dir in artifact_v2._local_filepath.parents artifact_v2.save() assert artifact_v2.is_latest assert not artifact.is_latest cache_path_v2 = artifact_v2.cache() assert cache_path_v2.exists() assert ( cache_path_v2 == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad" ) assert cache_path_v2.stat().st_mtime > timestamp_v1 cache_path_v2.unlink() artifact_v2.cache(mute=True) assert cache_path_v2.exists() assert ( cache_path_v2 == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad" ) assert "test_cache" in load_h5ad(cache_path_v2).obs.columns cache_mtime = cache_path_v2.stat().st_mtime assert cache_mtime == artifact_v2.path.modified.timestamp() assert cache_mtime > timestamp_v1 # old version cache ignores key cache_path_v1 = artifact.cache() assert cache_path_v1.exists() assert cache_path_v1.name == f"{artifact.uid}.h5ad" artifact_v2.versions.delete(permanent=True) def test_corrupted_cache_local(): filepath = ln.examples.datasets.anndata_file_pbmc68k_test() artifact = ln.Artifact.from_anndata(filepath, key="test_corrupt_cache_local.h5ad") artifact.save() # corrupt cache with open(artifact._cache_path, "r+b") as f: f.write(b"corruption") # just raises an exception, nothing to re-sync on local with pytest.raises(OSError): artifact.load() with pytest.raises(OSError): artifact.open() artifact.delete(permanent=True) def test_corrupted_cache_cloud(switch_storage): # check that we have cloud storage assert ln.setup.settings.storage.root_as_str == switch_storage filepath = ln.examples.datasets.anndata_file_pbmc68k_test() artifact = ln.Artifact.from_anndata(filepath, key="test_corrupt_cache_cloud.h5ad") artifact.save() # corrupt cache # sleep not to reset cache mtime to a smaller value # it is increased artificially on cache copying in save # so due to lower granularity of cloud mtimes and fast code execution # after the change cache mtime can become smaller than cloud mtime sleep(1) with open(artifact._cache_path, "r+b") as f: f.write(b"corruption") assert artifact._cache_path.stat().st_mtime > artifact.path.stat().st_mtime # check that it is indeed corrupted with pytest.raises(OSError): load_h5ad(artifact.cache()) # should load successfully artifact.load() # check open also assert artifact._cache_path.exists() with open(artifact._cache_path, "r+b") as f: f.write(b"corruption") # should open successfully with artifact.open(): pass # corrupted cache has been deleted assert not artifact._cache_path.exists() artifact.delete(permanent=True) ================================================ FILE: tests/storage/test_connect_reconnect.py ================================================ import lamindb as ln import pytest def test_connect_reconnect(): # testuser2 needs write access lamin-site-assets because of a fluke # in the legacy collaborator management, it seems assert ln.setup.settings.user.handle == "testuser2" ln.connect("lamindb-unit-tests-storage") # this is not changing anything count1 = ln.Artifact.filter().count() # a public instance that does not have bionty configured ln.connect("laminlabs/lamin-site-assets") count2 = 
ln.Artifact.filter().count() assert count1 != count2 with pytest.raises(ln.setup.errors.ModuleWasntConfigured): import bionty as bt ln.connect("lamindb-unit-tests-storage") import bionty as bt count3 = bt.Gene.filter().count() assert count2 != count3 ================================================ FILE: tests/storage/test_storage_lifecycle.py ================================================ from pathlib import Path import lamindb as ln import pytest from lamindb_setup.core._hub_core import get_storage_records_for_instance def check_storage_location_on_hub_exists(uid: str): all_storage_records = get_storage_records_for_instance( ln.setup.settings.instance._id ) length = len([r for r in all_storage_records if r["lnid"] == uid]) if length not in {0, 1}: raise AssertionError( f"Expected 0 or 1 storage records for uid {uid}, found {length}." ) return length == 1 def test_reference_storage_location(ccaplog): ln.Artifact("s3://lamindata/iris_studies/study0_raw_images") assert ln.Storage.get(root="s3://lamindata").instance_uid == "4XIuR0tvaiXM" # assert ( # "referenced read-only storage location at s3://lamindata, is managed by instance with uid 4XIuR0tvaiXM" # in ccaplog.text # ) def test_switch_delete_storage_location(): ln.settings.storage = "./default_storage_unit_storage" assert ( ln.settings.storage.root.resolve() == Path("./default_storage_unit_storage").resolve() ) new_storage_location = "s3://lamindb-ci/test-settings-switch-storage" ln.Storage(new_storage_location).save() ln.settings.storage = new_storage_location assert ln.setup.settings.storage.type_is_cloud assert ln.setup.settings.storage.root_as_str == new_storage_location # root.fs contains the underlying fsspec filesystem # the following is set by lamindb to True for s3 by default assert ln.setup.settings.storage.root.fs.cache_regions ln.settings.storage = new_storage_location, {"cache_regions": False} assert not ln.setup.settings.storage.root.fs.cache_regions assert ln.setup.settings.storage.root.exists() # now work with the new storage location new_storage = ln.Storage.get(root=new_storage_location) assert check_storage_location_on_hub_exists(new_storage.uid) artifact = ln.Artifact(".gitignore", key="test_artifact").save() assert new_storage.root in artifact.path.as_posix() # artifacts exist with pytest.raises(AssertionError) as err: new_storage.delete() assert "Cannot delete storage with artifacts in current instance." in err.exconly() artifact.delete(permanent=True, storage=False) # still some files in there with pytest.raises(ln.setup.errors.StorageNotEmpty) as err: new_storage.delete() assert ( "'s3://lamindb-ci/test-settings-switch-storage/.lamindb' contains 1 objects" in err.exconly() ) # now delete the artifact so that the storage location is empty artifact.path.unlink() with pytest.raises(AssertionError) as err: new_storage.delete() assert ( "Cannot delete the current storage location, switch to another." 
in err.exconly() ) # check all attempts unsuccessful so far assert check_storage_location_on_hub_exists(new_storage.uid) # switch back to default storage ln.settings.storage = "./default_storage_unit_storage" storage_marker = ln.UPath(new_storage_location) / ".lamindb/storage_uid.txt" assert storage_marker.exists() new_storage.delete() assert not check_storage_location_on_hub_exists(new_storage.uid) assert not storage_marker.exists() ================================================ FILE: tests/storage/test_streaming.py ================================================ import gzip import shutil from pathlib import Path import anndata as ad import h5py import lamindb as ln import numpy as np import pandas as pd import pytest import zarr from lamindb.core.loaders import load_h5ad from lamindb.core.storage._anndata_accessor import _anndata_n_observations, _to_index from lamindb.core.storage._backed_access import ( _flat_suffixes, backed_access, ) from lamindb.core.storage._polars_lazy_df import _open_polars_lazy_df, _polars_options from lamindb.core.storage._pyarrow_dataset import _open_pyarrow_dataset from lamindb.core.storage._zarr import load_zarr from lamindb.core.storage.objects import infer_suffix, write_to_disk @pytest.fixture def bad_adata_path(): fp = ln.examples.datasets.anndata_file_pbmc68k_test() adata = load_h5ad(fp) to = fp.with_name("pbmc68k_bad.h5ad") shutil.copy(fp, to) fp = to file = h5py.File(fp, mode="r+") for field_name in ("obs", "var"): field = getattr(adata, field_name).to_records() formats = [] for name, (dt, _) in field.dtype.fields.items(): if dt == "O": new_dt = str(field[name].astype(str).dtype).replace(" 50 assert "artifact" in captured.out.lower() def test_transfer_from_remote_to_local(ccaplog): """Test transfer from remote to local instance.""" bt.Gene.filter().delete(permanent=True) bt.Organism.filter().delete(permanent=True) ln.ULabel.filter().delete(permanent=True) bt.CellType.filter().delete(permanent=True) # test transfer from an instance with an extra schema module: pertdb # we also made sure that the artifact here has a pertdb label attached # transfer 1st artifact artifact1 = ln.Artifact.connect("laminlabs/lamin-dev").get("livFRRpM") # test describe postgres result = get_artifact_or_run_with_related( artifact1, include_m2m=True, include_fk=True, include_feature_link=True, include_schema=True, ) assert result["related_data"]["m2m"]["tissues"] == { 2: { "id": 2, "uid": "6VHBo6XsJZqmaQ", "abbr": None, "name": "cortex of kidney", "tissue": 2, "feature": None, "ontology_id": "UBERON:0001225", "tissue_display": "cortex of kidney", } } assert sorted( result["related_data"]["link"]["links_ulabel"], key=lambda d: d["id"] ) == [ { "id": 7, "uid": "ydyPUMjh", "name": "donor_24", "ulabel": 15, "feature": 1, "reference": None, "reference_type": None, "ulabel_display": "donor_24", }, { "id": 8, "uid": "JJ3d8a2v", "name": "na", "ulabel": 10, "feature": 10, "reference": None, "reference_type": None, "ulabel_display": "na", }, ] assert result["related_data"]["m2m_schemas"][615][0] == "obs" assert result["related_data"]["m2m_schemas"][615][1] == { "Feature": [ "donor_id", "development_stage", "disease", "cell_type", "sex", "assay", "tissue", "self_reported_ethnicity", "tissue_type", "suspension_type", "organism", ] } assert result["related_data"]["fk"]["storage"] == { "id": 4, "name": "s3://cellxgene-data-public", } id_remote = artifact1.id run_remote = artifact1.run transform_remote = artifact1.transform created_by_remote = artifact1.created_by storage_remote = 
artifact1.storage organism_remote = artifact1.organisms.get(name="human") artifact1.save(transfer="annotations") # assert MODULE_WASNT_CONFIGURED_MESSAGE_TEMPLATE.format("pertdb") in ccaplog.text # check all ids are adjusted assert id_remote != artifact1.id assert run_remote != artifact1.run assert transform_remote != artifact1.transform assert created_by_remote.handle != artifact1.created_by.handle assert storage_remote.uid == artifact1.storage.uid assert storage_remote.created_at == artifact1.storage.created_at organism = artifact1.organisms.get(name="human") assert organism.created_at != organism_remote.created_at # now check that this is idempotent and we can run it again artifact_repeat = ln.Artifact.connect("laminlabs/lamin-dev").get( "livFRRpMaOgb3y8U2mK2" ) artifact_repeat.save(transfer="annotations") # now prepare a new test case # mimic we have an existing feature with a different uid but same name feature = ln.Feature.get(name="organism") feature.uid = "existing" feature.save() # transfer 2nd artifact artifact2 = ln.Artifact.connect("laminlabs/lamin-dev").get("qz35YaRk") artifact2.save(transfer="annotations") # check the feature name assert artifact2.organisms.get(name="mouse") assert ( artifact1.features.slots["obs"].members.get(name="organism").uid == "existing" ) # test transfer from an instance with fewer modules (laminlabs/lamin-site-assets) artifact3 = ln.Artifact.connect("laminlabs/lamin-site-assets").get( "lgRNHNtMxjU0y8nIagt7" ) # test that implicit saving through `load()` works (also occurs for `cache()` or `open()` for run input tracking) artifact3.load() # delete with storage=False, because these are all stored in the source instances artifact1.delete(storage=False, permanent=True) artifact2.delete(storage=False, permanent=True) artifact3.delete( storage=False ) # there is an issue here with permanent deletion because of schema module mismatch def test_transfer_into_space(): # grab any ulabel from the default space ulabel = ln.ULabel.connect("laminlabs/lamin-dev").filter(space__id=1).first() space = ln.Space(name="space for transfer", uid="00000123").save() with patch.object(ln.context, "_space", new=space): ulabel.save() assert ulabel.space_id == space.id ulabel.delete(permanent=True) space.delete() def test_using_record_organism(): """Test passing record and organism to the using_key instance.""" import bionty as bt release_110_cxg = bt.Source.connect("laminlabs/lamin-dev").get( organism="mouse", entity="bionty.Gene", version="release-110" ) release_112_cxg = bt.Source.connect("laminlabs/lamin-dev").get( organism="mouse", entity="bionty.Gene", version="release-112" ) release_110 = release_110_cxg.save() # transfer source record release_110_cxg = ( # re-fetch bt.Source.connect("laminlabs/lamin-dev").get( organism="mouse", entity="bionty.Gene", version="release-110" ) ) # passing the wrong source inspector = bt.Gene.connect("laminlabs/lamin-dev").inspect( ["ENSMUSG00000102862", "ENSMUSG00000084826"], field=bt.Gene.ensembl_gene_id, source=release_112_cxg, strict_source=True, ) assert len(inspector.validated) == 0 # passing the correct source inspector = bt.Gene.connect("laminlabs/lamin-dev").inspect( ["ENSMUSG00000102862", "ENSMUSG00000084826"], field=bt.Gene.ensembl_gene_id, source=release_110_cxg, strict_source=True, ) assert len(inspector.validated) == 2 # passing the correct source but from the wrong instance with pytest.raises(ValueError) as error: inspector = bt.Gene.connect("laminlabs/lamin-dev").inspect( ["ENSMUSG00000102862", "ENSMUSG00000084826"], 
field=bt.Gene.ensembl_gene_id, source=release_110, ) assert ( "record must be a bionty.Source record from instance 'laminlabs/lamin-dev'" in str(error.value) ) def test_using_query_by_feature(): assert ln.Artifact.connect("laminlabs/cellxgene").filter(n_of_donors__gte=100) # TODO: uncomment after migrations # def test_transfer_features_uid(): # """Test that a new feature is created based on uid.""" # existing_tissue_feature = ( # ln.Feature.connect("laminlabs/lamin-dev").get(name="tissue").save() # ) # artifact = ln.Artifact.connect("laminlabs/pertdata").get("aT2dp4hC6XDwrafN") # artifact.save(transfer="annotations") # # now a new feature called "tissue" is created because the uid is different # newly_transferred_tissue_feature = ln.Feature.get( # name="tissue", schemas__artifacts__uid=artifact.uid # ) # assert existing_tissue_feature.uid != newly_transferred_tissue_feature.uid ================================================ FILE: tests/tiledbsoma/conftest.py ================================================ import os import shutil from pathlib import Path from time import perf_counter import lamindb as ln import lamindb_setup as ln_setup import numpy as np import pandas as pd import pytest from lamin_utils import logger def pytest_sessionstart(): t_execute_start = perf_counter() ln_setup._TESTING = True os.environ["LAMIN_TESTING"] = "true" os.environ["LAMINDB_TEST_DB_VENDOR"] = "sqlite" print("running tests on SQLite") ln.setup.init( storage="./default_storage_tiledbsoma", modules="bionty", name="lamindb-unit-tests-tiledbsoma", ) ln.settings.creation.artifact_silence_missing_run_warning = True # Pre-register remote roots used in tests so `ln.settings.storage = ...` # doesn't prompt for interactive confirmation under pytest capture. ln.Storage("s3://lamindb-test/tiledbsoma").save() total_time_elapsed = perf_counter() - t_execute_start print(f"time to setup the instance: {total_time_elapsed:.1f}s") def pytest_sessionfinish(session: pytest.Session): logger.set_verbosity(1) if Path("./default_storage_tiledbsoma").exists(): shutil.rmtree("./default_storage_tiledbsoma") upath = ln_setup.core.upath.UPath("s3://lamindb-test/tiledbsoma") if upath.exists(): upath.rmdir() ln.setup.delete("lamindb-unit-tests-tiledbsoma", force=True) del os.environ["LAMIN_TESTING"] @pytest.fixture(scope="session") def adata_file(): import anndata as ad adata = ad.AnnData( X=np.array([[1, 2, 3], [4, 5, 6]]), obs={"feat1": ["A", "B"]}, var=pd.DataFrame(index=["MYC", "TCF7", "GATA1"]), obsm={"X_pca": np.array([[1, 2], [3, 4]])}, ) filepath = Path("adata_file.h5ad") adata.write(filepath) yield "adata_file.h5ad" filepath.unlink(missing_ok=True) @pytest.fixture(scope="function") def clean_soma_files(request): path = request.param if hasattr(request, "param") else "small_dataset.tiledbsoma" if Path(path).exists(): shutil.rmtree(path) yield path if Path(path).exists(): shutil.rmtree(path) @pytest.fixture(scope="function") def soma_experiment_file(clean_soma_files): import tiledbsoma.io adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") tiledbsoma.io.from_anndata("test.tiledbsoma", adata, measurement_name="RNA") yield "test.tiledbsoma" if Path("test.tiledbsoma").exists(): shutil.rmtree("test.tiledbsoma") ================================================ FILE: tests/tiledbsoma/test_artifact_basics.py ================================================ import lamindb as ln import pytest from lamindb.models.artifact import data_is_soma_experiment def test_create_from_soma_experiment(soma_experiment_file, 
adata_file):
    with pytest.raises(ValueError) as error:
        ln.Artifact.from_tiledbsoma(adata_file, description="test1")
    assert (
        "data has to be a SOMA Experiment object or a path to SOMA Experiment store."
        in error.exconly()
    )

    af = ln.Artifact.from_tiledbsoma(soma_experiment_file, description="test1")
    assert af.description == "test1"
    assert af.key is None
    assert af.otype == "tiledbsoma"
    assert af.n_observations == 3


def test_data_is_soma_experiment_paths():
    assert data_is_soma_experiment("something.tiledbsoma")
    assert data_is_soma_experiment(ln.UPath("something.tiledbsoma"))


def test_data_is_soma_experiment(soma_experiment_file):
    import tiledbsoma

    with tiledbsoma.Experiment.open(soma_experiment_file) as store:
        assert data_is_soma_experiment(store)


================================================
FILE: tests/tiledbsoma/test_curators.py
================================================
import shutil

import bionty as bt
import lamindb as ln
import pytest
import tiledbsoma
import tiledbsoma.io


def test_tiledbsoma_curator(clean_soma_files):
    """Test TiledbSomaExperimentCurator with schema."""
    obs_schema = ln.Schema(
        features=[
            ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
            ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
        ],
    ).save()
    var_schema = ln.Schema(
        features=[
            ln.Feature(name="var_id", dtype=bt.Gene.ensembl_gene_id).save(),
        ],
        coerce=True,
    ).save()
    soma_schema = ln.Schema(
        otype="tiledbsoma",
        slots={
            "obs": obs_schema,
            "ms:RNA": var_schema,
        },
    ).save()

    # Convert AnnData to SOMA format
    adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData")
    tiledbsoma.io.from_anndata(
        "small_dataset.tiledbsoma", adata, measurement_name="RNA"
    )

    # Test with invalid dataset
    with pytest.raises(ln.errors.InvalidArgument) as e:
        ln.curators.TiledbsomaExperimentCurator(adata, soma_schema)
    assert "dataset must be SOMAExperiment-like." in str(e.value)

    # Test with invalid schema
    with tiledbsoma.Experiment.open("small_dataset.tiledbsoma") as experiment:
        with pytest.raises(ln.errors.InvalidArgument) as e:
            ln.curators.TiledbsomaExperimentCurator(experiment, schema=var_schema)
        assert "Schema otype must be 'tiledbsoma'." in str(e.value)

    with tiledbsoma.Experiment.open("small_dataset.tiledbsoma") as experiment:
        curator = ln.curators.TiledbsomaExperimentCurator(experiment, soma_schema)
        assert "obs" in curator.slots
        assert "ms:RNA" in curator.slots
        curator.validate()
        artifact = curator.save_artifact(
            key="examples/soma_experiment.tiledbsoma",
            description="SOMA experiment with schema validation",
        )

    assert artifact.schema == soma_schema
    assert "obs" in artifact.features.slots
    assert "ms:RNA" in artifact.features.slots

    # Check feature values are properly annotated
    assert set(artifact.features.get_values()["cell_type_by_expert"]) == {
        "CD8-positive, alpha-beta T cell",
        "B cell",
    }
    assert set(artifact.features.get_values()["cell_type_by_model"]) == {
        "T cell",
        "B cell",
    }

    # Altered data (gene typo)
    adata_typo = ln.examples.datasets.mini_immuno.get_dataset1(
        otype="AnnData", with_gene_typo=True
    )
    typo_soma_path = "./mini_immuno_dataset1_typo.tiledbsoma"
    tiledbsoma.io.from_anndata(typo_soma_path, adata_typo, measurement_name="RNA")

    with tiledbsoma.Experiment.open(typo_soma_path) as experiment_typo:
        curator_typo = ln.curators.TiledbsomaExperimentCurator(
            experiment_typo, soma_schema
        )
        # Validation should fail due to typo
        with pytest.raises(ln.errors.ValidationError) as error:
            curator_typo.validate()
        assert "GeneTypo" in str(error.value)

    # Clean up
    shutil.rmtree(typo_soma_path)
    artifact.delete(permanent=True)
    soma_schema.delete(permanent=True)
    var_schema.delete(permanent=True)
    obs_schema.delete(permanent=True)


================================================
FILE: tests/tiledbsoma/test_storage.py
================================================
import shutil
from pathlib import Path

import lamindb as ln
import numpy as np
import pytest
import tiledbsoma
import tiledbsoma.io
from lamindb.core.loaders import load_h5ad
from lamindb.core.storage._tiledbsoma import (
    SOMAS3ContextFactory,
    _open_tiledbsoma,
    _soma_store_n_observations,
)
from lamindb.integrations import save_tiledbsoma_experiment


@pytest.mark.parametrize("storage", [None, "s3://lamindb-test/tiledbsoma"])
def test_write_read_tiledbsoma(storage):
    if storage is not None:
        previous_storage = ln.setup.settings.storage.root_as_str
        ln.settings.storage = storage

    test_file = ln.examples.datasets.anndata_file_pbmc68k_test()
    adata = load_h5ad(test_file)
    # write less
    adata = adata[:5, :2].copy()
    del adata.varp
    del adata.obsp
    del adata.layers
    del adata.uns  # seems to cause problems for append

    if storage is None:
        # test local with zarr
        test_file = test_file.with_suffix(".zarr")
        adata.write_zarr(test_file)
    else:
        adata.write_h5ad(test_file)

    create_transform = ln.Transform(key="test create tiledbsoma store").save()
    create_run = ln.Run(create_transform).save()

    # fails with a view
    with pytest.raises(ValueError, match="Can not write an `AnnData` view"):
        save_tiledbsoma_experiment([adata[:2]], run=create_run, measurement_name="RNA")

    artifact_soma = save_tiledbsoma_experiment(
        [test_file],
        description="test tiledbsoma",
        key="scrna/my-big-dataset.tiledbsoma",  # can also be None, but that's trivial
        run=create_run,
        measurement_name="RNA",
    )
    assert artifact_soma.path.stem == artifact_soma.uid[:16]
    assert artifact_soma.key == "scrna/my-big-dataset.tiledbsoma"
    assert artifact_soma.suffix == ".tiledbsoma"
    assert artifact_soma._key_is_virtual
    assert artifact_soma.otype == "tiledbsoma"
    assert artifact_soma.n_observations == adata.n_obs

    with artifact_soma.open() as store:  # mode="r" by default
        assert isinstance(store, tiledbsoma.Experiment)

        obs = store["obs"]
        n_obs = len(obs)
        assert n_obs == adata.n_obs
        assert "lamin_run_uid" in obs.schema.names
        run_ids = (
            obs.read(column_names=["lamin_run_uid"])
            .concat()
            .to_pandas()["lamin_run_uid"]
        )
        assert all(run_ids == create_run.uid)
        assert set(run_ids.cat.categories) == {create_run.uid}

        # test reading X
        ms_rna = store.ms["RNA"]
        n_vars = len(ms_rna.var)
        assert n_vars == adata.n_vars
        X = ms_rna["X"]["data"].read().coos((n_obs, n_vars)).concat().to_scipy()
        assert X.sum() == adata.X.sum()

    cache_path = artifact_soma.cache()
    hash_before_changes = artifact_soma.hash

    with artifact_soma.open(mode="w") as store:
        assert store.__class__.__name__ == "ExperimentTrack"
        tiledbsoma.io.add_matrix_to_collection(
            exp=store,
            measurement_name="RNA",
            collection_name="obsm",
            matrix_name="test_array",
            matrix_data=np.ones((n_obs, 2)),
        )
    assert artifact_soma.hash != hash_before_changes
    assert artifact_soma.uid.endswith("0001")
    if storage is not None:
        # cache should be ignored and deleted after the changes
        assert not cache_path.exists()
    else:
        assert artifact_soma.path == cache_path

    adata_to_append_1 = adata[:3].copy()
    adata_to_append_1.obs["obs_id"] = adata_to_append_1.obs.index.to_numpy() + "***"
    adata_to_append_1.var["var_id"] = adata_to_append_1.var.index
    adata_to_append_2 = adata[3:5].copy()
    adata_to_append_2.obs["obs_id"] = adata_to_append_2.obs.index.to_numpy() + "***"
    adata_to_append_2.var["var_id"] = adata_to_append_2.var.index
    adata_to_append_2.write_h5ad("adata_to_append_2.h5ad")

    append_transform = ln.Transform(key="test append tiledbsoma store").save()
    append_run = ln.Run(append_transform).save()

    # here run should be passed
    with pytest.raises(ValueError, match="Pass `run`"):
        save_tiledbsoma_experiment(
            [adata_to_append_1],
            revises=artifact_soma,
            run=None,
            measurement_name="RNA",
        )

    artifact_soma_append = save_tiledbsoma_experiment(
        [adata_to_append_1, "adata_to_append_2.h5ad"],
        revises=artifact_soma,
        run=append_run,
        measurement_name="RNA",
        append_obsm_varm=True,
    )
    assert artifact_soma_append.uid.endswith("0002")

    artifact_soma.refresh_from_db()
    assert not artifact_soma.is_latest
    match = "its files were overwritten and are no longer available"
    with pytest.raises(ValueError, match=match):
        artifact_soma.open()
    with pytest.raises(ValueError, match=match):
        artifact_soma.load()
    with pytest.raises(ValueError, match=match):
        artifact_soma.cache()

    # below is inherited from "scrna/my-big-dataset.tiledbsoma"
    assert artifact_soma_append.key == "scrna/my-big-dataset.tiledbsoma"

    # wrong mode, should be either r or w for tiledbsoma
    with pytest.raises(ValueError):
        artifact_soma_append.open(mode="p")

    # test running without the context manager
    store = artifact_soma_append.open()
    n_obs_final = adata.n_obs + sum(
        adt.n_obs for adt in [adata_to_append_1, adata_to_append_2]
    )
    obs = store["obs"]
    assert len(obs) == n_obs_final == artifact_soma_append.n_observations
    run_ids = (
        obs.read(column_names=["lamin_run_uid"])
        .concat()
        .to_pandas()["lamin_run_uid"]
        .cat.categories
    )
    assert set(run_ids) == {create_run.uid, append_run.uid}
    store.close()

    # test correctness of deletion for _overwrite_versions=True
    soma_path = artifact_soma_append.path
    assert soma_path.exists()
    # select specific version and delete
    # check that the store is still there
    assert soma_path.exists()
    assert ln.Artifact.filter(description="test tiledbsoma").count() == 3
    artifact_soma_append.versions.filter(uid__endswith="0001").one().delete(
        permanent=True
    )
    assert soma_path.exists()
    assert ln.Artifact.filter(description="test tiledbsoma").count() == 2
    # make sure the store is actually deleted
    artifact_soma_append.delete(permanent=True)
    assert not soma_path.exists()
    assert not ln.Artifact.filter(description="test tiledbsoma").exists()

    Path("adata_to_append_2.h5ad").unlink()

    if storage is not None:
        ln.settings.storage = previous_storage


def test_from_tiledbsoma():
    test_file = ln.examples.datasets.anndata_file_pbmc68k_test()
    soma_path = "mystore.tiledbsoma"
    tiledbsoma.io.from_h5ad(soma_path, test_file, measurement_name="RNA")

    # wrong suffix
    with pytest.raises(ValueError):
        ln.Artifact.from_tiledbsoma("mystore")

    artifact = ln.Artifact.from_tiledbsoma(
        soma_path, description="test soma store"
    ).save()
    assert artifact.n_observations == 30

    with _open_tiledbsoma(artifact.path, mode="r") as store:
        # experiment
        assert _soma_store_n_observations(store) == 30
        # dataframe
        assert _soma_store_n_observations(store.obs) == 30
        # treat as unstructured collection, data + raw
        assert _soma_store_n_observations(store.ms) == 60
        # measurement
        assert _soma_store_n_observations(store.ms["RNA"]) == 30
        # array
        assert _soma_store_n_observations(store.ms["RNA"]["X"]["data"]) == 30

    artifact.delete(permanent=True)
    shutil.rmtree(soma_path)


def test_tiledb_config():
    storepath = ln.UPath("s3://bucket/key?endpoint_url=http://localhost:9000/s3")
    tiledb_config = SOMAS3ContextFactory(storepath).get_context().tiledb_config
    assert tiledb_config["vfs.s3.endpoint_override"] == "localhost:9000/s3"
    assert tiledb_config["vfs.s3.scheme"] == "http"
    assert tiledb_config["vfs.s3.use_virtual_addressing"] == "false"
    assert tiledb_config["vfs.s3.region"] == ""


def test_tiledbsoma_in_managed_storage():
    artifact = ln.Artifact.connect("laminlabs/lamindata").get(
        key="example_datasets/small_dataset1.tiledbsoma"
    )
    path = artifact.path
    assert "session" in path.storage_options

    ctx_factory = SOMAS3ContextFactory(path)
    assert ctx_factory._refreshable_credentials is not None
    ctx = ctx_factory.get_context()
    tiledb_config = ctx.tiledb_config
    assert "vfs.s3.aws_access_key_id" in tiledb_config
    assert "vfs.s3.aws_secret_access_key" in tiledb_config
    assert "vfs.s3.aws_session_token" in tiledb_config

    path_str = path.as_posix()
    # check with managed credentials
    with tiledbsoma.Experiment.open(path_str, mode="r", context=ctx) as store:
        assert _soma_store_n_observations(store) == 3
    # check with anon, s3://lamindata is public
    with _open_tiledbsoma(ln.UPath(path_str, anon=True), mode="r") as store:
        assert _soma_store_n_observations(store) == 3
    # pass credentials manually
    key = tiledb_config["vfs.s3.aws_access_key_id"]
    secret = tiledb_config["vfs.s3.aws_secret_access_key"]
    token = tiledb_config["vfs.s3.aws_session_token"]
    with _open_tiledbsoma(
        ln.UPath(path_str, key=key, secret=secret, token=token), mode="r"
    ) as store:
        assert _soma_store_n_observations(store) == 3